From patchwork Fri Sep 2 08:39:08 2022
X-Patchwork-Submitter: dengjianbo <dengjianbo@loongson.cn>
X-Patchwork-Id: 57274
From: dengjianbo <dengjianbo@loongson.cn>
To: adhemerval.zanella@linaro.org, libc-alpha@sourceware.org,
 i.swmail@xen0n.name
Cc: dengjianbo <dengjianbo@loongson.cn>, xuchenghua@loongson.cn,
 joseph_myers@mentor.com, caiyinyu@loongson.cn
Subject: [PATCH 1/1] LoongArch: Add optimized functions.
Date: Fri, 2 Sep 2022 16:39:08 +0800
Message-Id: <20220902083908.2560918-2-dengjianbo@loongson.cn>
In-Reply-To: <20220902083908.2560918-1-dengjianbo@loongson.cn>
References: <20220902083908.2560918-1-dengjianbo@loongson.cn>
---
 sysdeps/loongarch/lp64/memmove.S   | 491 +++++++++++++++++++++++++++++
 sysdeps/loongarch/lp64/strchr.S    | 145 +++++++++
 sysdeps/loongarch/lp64/strchrnul.S | 160 ++++++++++
 sysdeps/loongarch/lp64/strcmp.S    | 210 ++++++++++++
 sysdeps/loongarch/lp64/strncmp.S   | 281 +++++++++++++++++
 5 files changed, 1287 insertions(+)
 create mode 100644 sysdeps/loongarch/lp64/memmove.S
 create mode 100644 sysdeps/loongarch/lp64/strchr.S
 create mode 100644 sysdeps/loongarch/lp64/strchrnul.S
 create mode 100644 sysdeps/loongarch/lp64/strcmp.S
 create mode 100644 sysdeps/loongarch/lp64/strncmp.S
diff --git a/sysdeps/loongarch/lp64/memmove.S b/sysdeps/loongarch/lp64/memmove.S
new file mode 100644
index 0000000000..632820404e
--- /dev/null
+++ b/sysdeps/loongarch/lp64/memmove.S
@@ -0,0 +1,491 @@
+/* Assembly implementation of memmove.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifdef _LIBC
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+#else
+#include <regdef.h>
+#include <sys/asm.h>
+#endif
+
+/* Allow the routine to be named something else if desired.  */
+#ifndef MEMMOVE_NAME
+#define MEMMOVE_NAME memmove
+#endif
+
+#define LD_64(reg, n) \
+	ld.d	t0, reg, n;	\
+	ld.d	t1, reg, n+8;	\
+	ld.d	t2, reg, n+16;	\
+	ld.d	t3, reg, n+24;	\
+	ld.d	t4, reg, n+32;	\
+	ld.d	t5, reg, n+40;	\
+	ld.d	t6, reg, n+48;	\
+	ld.d	t7, reg, n+56;
+
+#define ST_64(reg, n) \
+	st.d	t0, reg, n;	\
+	st.d	t1, reg, n+8;	\
+	st.d	t2, reg, n+16;	\
+	st.d	t3, reg, n+24;	\
+	st.d	t4, reg, n+32;	\
+	st.d	t5, reg, n+40;	\
+	st.d	t6, reg, n+48;	\
+	st.d	t7, reg, n+56;
+
+#define LDST_1024 \
+	LD_64(a1, 0);	\
+	ST_64(a0, 0);	\
+	LD_64(a1, 64);	\
+	ST_64(a0, 64);	\
+	LD_64(a1, 128);	\
+	ST_64(a0, 128);	\
+	LD_64(a1, 192);	\
+	ST_64(a0, 192);	\
+	LD_64(a1, 256);	\
+	ST_64(a0, 256);	\
+	LD_64(a1, 320);	\
+	ST_64(a0, 320);	\
+	LD_64(a1, 384);	\
+	ST_64(a0, 384);	\
+	LD_64(a1, 448);	\
+	ST_64(a0, 448);	\
+	LD_64(a1, 512);	\
+	ST_64(a0, 512);	\
+	LD_64(a1, 576);	\
+	ST_64(a0, 576);	\
+	LD_64(a1, 640);	\
+	ST_64(a0, 640);	\
+	LD_64(a1, 704);	\
+	ST_64(a0, 704);	\
+	LD_64(a1, 768);	\
+	ST_64(a0, 768);	\
+	LD_64(a1, 832);	\
+	ST_64(a0, 832);	\
+	LD_64(a1, 896);	\
+	ST_64(a0, 896);	\
+	LD_64(a1, 960);	\
+	ST_64(a0, 960);
+
+#define LDST_1024_BACK \
+	LD_64(a4, -64);	\
+	ST_64(a3, -64);	\
+	LD_64(a4, -128);	\
+	ST_64(a3, -128);	\
+	LD_64(a4, -192);	\
+	ST_64(a3, -192);	\
+	LD_64(a4, -256);	\
+	ST_64(a3, -256);	\
+	LD_64(a4, -320);	\
+	ST_64(a3, -320);	\
+	LD_64(a4, -384);	\
+	ST_64(a3, -384);	\
+	LD_64(a4, -448);	\
+	ST_64(a3, -448);	\
+	LD_64(a4, -512);	\
+	ST_64(a3, -512);	\
+	LD_64(a4, -576);	\
+	ST_64(a3, -576);	\
+	LD_64(a4, -640);	\
+	ST_64(a3, -640);	\
+	LD_64(a4, -704);	\
+	ST_64(a3, -704);	\
+	LD_64(a4, -768);	\
+	ST_64(a3, -768);	\
+	LD_64(a4, -832);	\
+	ST_64(a3, -832);	\
+	LD_64(a4, -896);	\
+	ST_64(a3, -896);	\
+	LD_64(a4, -960);	\
+	ST_64(a3, -960);	\
+	LD_64(a4, -1024);	\
+	ST_64(a3, -1024);
+
+#ifdef ANDROID_CHANGES
+LEAF(MEMMOVE_NAME, 0)
+#else
+LEAF(MEMMOVE_NAME)
+#endif
+
+/* 1st var: dest ptr: void *str1 $r4 a0 */
+/* 2nd var: src ptr: void *str2 $r5 a1 */
+/* 3rd var: size_t num */
+/* t0~t9 registers as temp */
+
+	add.d	a4, a1, a2
+	add.d	a3, a0, a2
+	beq	a1, a0, less_1bytes
+	move	t8, a0
+	srai.d	a6, a2, 4	# num/16
+	beqz	a6, less_16bytes	# num < 16
+	srai.d	a6, a2, 6	# num/64
+	bnez	a6, more_64bytes	# num > 64
+	srai.d	a6, a2, 5
+	beqz	a6, less_32bytes	# num < 32
+
+	ld.d	t0, a1, 0

[... the remainder of memmove.S is missing from the archived message ...]
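The forward/backward pairing above (LDST_1024 versus LDST_1024_BACK, together
with the end pointers a3/a4 computed in the prologue) is what lets memmove
handle overlapping buffers. As a rough C model of that direction choice, this
is illustrative only, not the patch's code, and the function name is ours:

    #include <stddef.h>
    #include <stdint.h>

    /* Copy low-to-high when dest is below src, high-to-low otherwise,
       so an overlapping source is never overwritten before it is read.  */
    static void *model_memmove (void *dest, const void *src, size_t num)
    {
      unsigned char *d = dest;
      const unsigned char *s = src;
      if ((uintptr_t) d < (uintptr_t) s)
        for (size_t i = 0; i < num; i++)   /* forward, like LDST_1024 */
          d[i] = s[i];
      else if ((uintptr_t) d > (uintptr_t) s)
        while (num--)                      /* backward, like LDST_1024_BACK */
          d[num] = s[num];
      return dest;                         /* d == s: nothing to copy */
    }

The assembly performs the same selection, but moves 64-byte batches at a time
through registers t0-t7.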
diff --git a/sysdeps/loongarch/lp64/strchr.S b/sysdeps/loongarch/lp64/strchr.S
new file mode 100644
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strchr.S
@@ -0,0 +1,145 @@
+/* Assembly implementation of strchr.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ISA: LoongArch64
+ * Data Model: lp64
+ */
+
+/* Basic algorithm:
+
+   +. use ld.d and a mask for the first 8 bytes or less;
+
+   +. replicate c into all 8 bytes of a1 (bstrins.d);
+
+   +. xor a1 with each loaded word v0 to check whether c is found;
+
+   +. if ((v0 - 0x0101010101010101) & ~(v0 | 0x7f7f7f7f7f7f7f7f)) != 0,
+      some byte of v0 is \0; otherwise v0 contains no \0.  */
+
+#include <sys/asm.h>
+#include <sys/regdef.h>
+
+#define L_ADDIU	addi.d
+#define L_ADDU	add.d
+#define L_SUBU	sub.d
+
+#define STRCHR	strchr
+
+#define MOVN(rd,rs,rt) \
+	maskeqz	t6, rs, rt;\
+	masknez	rd, rd, rt;\
+	or	rd, rd, t6
+
+#define MOVN2(rd,rt) \
+	masknez	rd, rd, rt;\
+	or	rd, rd, rt
+
+/* char * strchr (const char *s1, int c); */
+
+LEAF(STRCHR)
+	.align	6
+
+	li.w	t4, 0x7
+	lu12i.w	a2, 0x01010
+	bstrins.d	a1, a1, 15, 8
+	andi	t0, a0, 0x7
+
+	ori	a2, a2, 0x101
+	andn	t4, a0, t4
+	slli.w	t1, t0, 3
+
+	ld.d	t4, t4, 0
+
+	nor	t8, zero, zero
+	bstrins.d	a1, a1, 31, 16
+	srl.d	t4, t4, t1
+
+	bstrins.d	a1, a1, 63, 32
+	bstrins.d	a2, a2, 63, 32
+	srl.d	a7, t8, t1
+
+	li.w	t1, 8
+	nor	t8, a7, zero
+	slli.d	a3, a2, 7
+	or	t5, t8, t4
+	and	t3, a7, a1
+
+	sub.w	t1, t1, t0
+	nor	a3, a3, zero
+	xor	t2, t5, t3
+	sub.d	a7, t5, a2
+	nor	a6, t5, a3
+
+	sub.d	a5, t2, a2
+	nor	a4, t2, a3
+
+	and	a6, a7, a6
+	and	a5, a5, a4
+	or	a7, a6, a5
+	bnez	a7, L(_mc8_a)
+
+	L_ADDU	a0, a0, t1
+L(_aloop):
+	ld.d	t4, a0, 0
+
+	xor	t2, t4, a1
+	sub.d	a7, t4, a2
+	nor	a6, t4, a3
+	sub.d	a5, t2, a2
+
+	nor	a4, t2, a3
+	and	a6, a7, a6
+	and	a5, a5, a4
+	or	a7, a6, a5
+	bnez	a7, L(_mc8_a)
+
+	ld.d	t4, a0, 8
+	L_ADDIU	a0, a0, 16
+	xor	t2, t4, a1
+	sub.d	a7, t4, a2
+	nor	a6, t4, a3
+	sub.d	a5, t2, a2
+
+	nor	a4, t2, a3
+	and	a6, a7, a6
+	and	a5, a5, a4
+	or	a7, a6, a5
+	beqz	a7, L(_aloop)
+
+	L_ADDIU	a0, a0, -8
+L(_mc8_a):
+
+	ctz.d	t0, a5
+	ctz.d	t2, a6
+
+	srli.w	t0, t0, 3
+	srli.w	t2, t2, 3
+	sltu	t1, t2, t0
+	L_ADDU	v0, a0, t0
+	masknez	v0, v0, t1
+	jr	ra
+END(STRCHR)
+
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+libc_hidden_builtin_def (strchr)
+weak_alias (strchr, index)
+#endif
+#endif
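The syndrome test in the comment above is the classic zero-in-word trick. A
hedged C rendering (illustrative only; the helper names are ours):

    #include <stdint.h>

    /* Nonzero iff some byte of v is 0x00: subtracting 0x01 from every
       byte borrows only out of zero bytes, and ~(v | 0x7f..7f) keeps
       just the high bit of bytes below 0x80.  */
    static int has_zero_byte (uint64_t v)
    {
      return ((v - 0x0101010101010101ULL)
              & ~(v | 0x7f7f7f7f7f7f7f7fULL)) != 0;
    }

    /* strchr-style search: v ^ (c repeated in every byte) has a zero
       byte exactly where c occurs, so the same test finds c.  */
    static int has_byte (uint64_t v, unsigned char c)
    {
      return has_zero_byte (v ^ (0x0101010101010101ULL * c));
    }

In the assembly, a2 holds the 0x0101..01 constant, a3 the 0x7f7f..7f one
(built with slli.d plus nor), and ctz.d converts the resulting bitmask into
the offset of the matching byte.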
diff --git a/sysdeps/loongarch/lp64/strchrnul.S b/sysdeps/loongarch/lp64/strchrnul.S
new file mode 100644
index 0000000000..dcbfded765
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strchrnul.S
@@ -0,0 +1,160 @@
+/* Assembly implementation of strchrnul.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ISA: LoongArch64
+ * Data Model: lp64
+ */
+
+/* Basic algorithm:
+
+   +. use ld.d and a mask for the first 8 bytes or less;
+
+   +. replicate c into all 8 bytes of a1 (bstrins.d);
+
+   +. xor a1 with each loaded word v0 to check whether c is found;
+
+   +. if ((v0 - 0x0101010101010101) & ~(v0 | 0x7f7f7f7f7f7f7f7f)) != 0,
+      some byte of v0 is \0; otherwise v0 contains no \0.  */
+
+#include <sys/asm.h>
+#include <sys/regdef.h>
+
+#define L_ADDIU	addi.d
+#define L_ADDU	add.d
+#define L_SUBU	sub.d
+
+#define STRCHRNUL	__strchrnul
+
+#define MOVN(rd,rs,rt) \
+	maskeqz	t6, rs, rt;\
+	masknez	rd, rd, rt;\
+	or	rd, rd, t6
+
+#define MOVZ(rd,rs,rt) \
+	masknez	t6, rs, rt;\
+	maskeqz	rd, rd, rt;\
+	or	rd, rd, t6
+
+#define MOVN2(rd,rt) \
+	masknez	rd, rd, rt;\
+	or	rd, rd, rt
+
+/* char * strchrnul (const char *s1, int c); */
+
+LEAF(STRCHRNUL)
+	.align	6
+
+	li.w	t4, 0x7
+	lu12i.w	a2, 0x01010
+	bstrins.d	a1, a1, 15, 8
+	andi	t0, a0, 0x7
+
+	ori	a2, a2, 0x101
+	andn	t4, a0, t4
+	slli.w	t1, t0, 3
+
+	ld.d	t4, t4, 0
+
+	nor	t8, zero, zero
+	bstrins.d	a1, a1, 31, 16
+	srl.d	t4, t4, t1
+
+	preld	0, a0, 32
+	bstrins.d	a1, a1, 63, 32
+	bstrins.d	a2, a2, 63, 32
+	srl.d	a7, t8, t1
+
+	nor	t8, a7, zero
+	slli.d	a3, a2, 7
+	or	t5, t8, t4
+	and	t3, a7, a1
+
+	nor	a3, a3, zero
+	xor	t2, t5, t3
+	sub.d	a7, t5, a2
+	nor	a6, t5, a3
+
+	li.w	t1, 8
+	sub.d	a5, t2, a2
+	nor	a4, t2, a3
+
+	and	a6, a7, a6
+	and	a5, a5, a4
+	or	a7, a6, a5
+	bnez	a7, L(_mc8_a)
+
+	sub.w	t1, t1, t0
+	L_ADDU	a0, a0, t1
+L(_aloop):
+	ld.d	t4, a0, 0
+
+	xor	t2, t4, a1
+	sub.d	a7, t4, a2
+	nor	a6, t4, a3
+	sub.d	a5, t2, a2
+
+	nor	a4, t2, a3
+	and	a6, a7, a6
+	and	a5, a5, a4
+
+	or	a7, a6, a5
+	bnez	a7, L(_mc8_a)
+
+	ld.d	t4, a0, 8
+	L_ADDIU	a0, a0, 16
+
+	xor	t2, t4, a1
+	sub.d	a7, t4, a2
+	nor	a6, t4, a3
+	sub.d	a5, t2, a2
+
+	nor	a4, t2, a3
+	and	a6, a7, a6
+	and	a5, a5, a4
+
+	or	a7, a6, a5
+	beqz	a7, L(_aloop)
+
+	L_ADDIU	a0, a0, -8
+L(_mc8_a):
+
+	ctz.d	t0, a5
+	ctz.d	t2, a6
+
+	srli.w	t0, t0, 3
+	srli.w	t2, t2, 3
+	slt	t1, t0, t2
+
+	MOVZ(t0,t2,t1)
+
+	L_ADDU	v0, a0, t0
+	jr	ra
+END(STRCHRNUL)
+
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+weak_alias(__strchrnul, strchrnul)
+libc_hidden_builtin_def (__strchrnul)
+#endif
+#endif
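The MOVZ/MOVN macros above synthesize conditional moves from LoongArch's
maskeqz/masknez, since the ISA has no csel-style instruction. A small C model
(illustrative; the helper is ours):

    #include <stdint.h>

    /* MOVZ(rd, rs, rt): rd = (rt == 0) ? rs : rd, with no branch.
       masknez t, rs, rt  -> t  = (rt == 0) ? rs : 0
       maskeqz rd, rd, rt -> rd = (rt != 0) ? rd : 0
       or      rd, rd, t  -> combine the two halves.  */
    static uint64_t movz (uint64_t rd, uint64_t rs, uint64_t rt)
    {
      uint64_t t = (rt == 0) ? rs : 0;
      rd = (rt != 0) ? rd : 0;
      return rd | t;
    }

strchrnul uses this at L(_mc8_a) to pick the smaller of the first-match and
first-NUL byte indexes without a branch.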
diff --git a/sysdeps/loongarch/lp64/strcmp.S b/sysdeps/loongarch/lp64/strcmp.S
new file mode 100644
index 0000000000..8160ae432d
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strcmp.S
@@ -0,0 +1,210 @@
+/* Assembly implementation of strcmp.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ISA: LoongArch64
+ * Data Model: lp64
+ */
+
+/* Basic algorithm:
+
+   +. let t0, t1 point to a0, a1; if a0 has the smaller low 3 bits of
+      the two, set a4 to 1 and let t0 point to whichever of a0 and a1
+      has the larger low 3 bits
+
+   +. if the low 3 bits of a0 and a1 are equal, do one unaligned load
+      and aligned ld loads afterwards;
+
+   +. if not, load partial words into t2 and t3 and check whether t2
+      contains a \0;
+
+   +. then use ld for t0 and an unaligned load for t1,
+
+   +. if the partial 8 bytes from t1 contain a \0, compare them with
+      the 8 bytes from t0 under a mask kept in a7
+
+   +. if not, load the other part of t1 and compare its 8 bytes with
+      the 8 bytes from t0
+
+   +. if ((v0 - 0x0101010101010101) & ~v0 & 0x8080808080808080) != 0,
+      some byte of v0 is \0; otherwise v0 contains no \0
+
+   +. for the partial 8 bytes loaded from 0(a0) into t3, preload t3
+      with 0xffffffffffffffff  */
+
+#include <sys/asm.h>
+#include <sys/regdef.h>
+
+#define STRCMP	strcmp
+
+#define REP8_01	0x0101010101010101
+#define REP8_7f	0x7f7f7f7f7f7f7f7f
+#define REP8_80	0x8080808080808080
+
+/* Parameters and Results */
+#define src1	a0
+#define src2	a1
+#define result	v0
+/* Note: v0 = a0 in lp64 ABI */
+
+/* Internal variable */
+#define data1		t0
+#define data2		t1
+#define has_nul		t2
+#define diff		t3
+#define syndrome	t4
+#define zeroones	t5
+#define sevenf		t6
+#define pos		t7
+#define exchange	t8
+#define tmp1		a4
+#define tmp2		a5
+#define tmp3		a6
+#define src1_off	a2
+#define src2_off	a3
+#define tmp4		a7
+
+/* rd <- rc ? ra : rb; clobbers tmp3.  */
+
+#define CONDITIONSEL(rd,rc,ra,rb)\
+	masknez	tmp3, rb, rc;\
+	maskeqz	rd, ra, rc;\
+	or	rd, rd, tmp3
+
+/* int strcmp (const char *s1, const char *s2); */
+
+LEAF(STRCMP)
+	.align	4
+
+	xor	tmp1, src1, src2
+	lu12i.w	zeroones, 0x01010
+	lu12i.w	sevenf, 0x7f7f7
+	andi	src1_off, src1, 0x7
+	ori	zeroones, zeroones, 0x101
+	ori	sevenf, sevenf, 0xf7f
+	andi	tmp1, tmp1, 0x7
+	bstrins.d	zeroones, zeroones, 63, 32
+	bstrins.d	sevenf, sevenf, 63, 32
+	bnez	tmp1, strcmp_misaligned8
+	bnez	src1_off, strcmp_mutual_align
+
+strcmp_loop_aligned:
+	ld.d	data1, src1, 0
+	addi.d	src1, src1, 8
+	ld.d	data2, src2, 0
+	addi.d	src2, src2, 8
+
+strcmp_start_realigned:
+	sub.d	tmp1, data1, zeroones
+	or	tmp2, data1, sevenf
+	xor	diff, data1, data2
+	andn	has_nul, tmp1, tmp2
+	or	syndrome, diff, has_nul
+	beqz	syndrome, strcmp_loop_aligned
+
+strcmp_end:
+	ctz.d	pos, syndrome
+	bstrins.d	pos, zero, 2, 0
+	srl.d	data1, data1, pos
+	srl.d	data2, data2, pos
+	andi	data1, data1, 0xff
+	andi	data2, data2, 0xff
+	sub.d	result, data1, data2
+	jr	ra
+
+strcmp_mutual_align:
+	bstrins.d	src1, zero, 2, 0
+	bstrins.d	src2, zero, 2, 0
+	slli.d	tmp1, src1_off, 0x3
+	ld.d	data1, src1, 0
+	sub.d	tmp1, zero, tmp1
+	ld.d	data2, src2, 0
+	addi.d	src1, src1, 8
+	addi.d	src2, src2, 8
+	nor	tmp2, zero, zero
+	srl.d	tmp2, tmp2, tmp1
+	or	data1, data1, tmp2
+	or	data2, data2, tmp2
+	b	strcmp_start_realigned
+
+strcmp_misaligned8:
+
+/* check if ((src1 != 0) && ((src2 == 0) || (src1 < src2)))
+   then exchange(src1,src2)  */
+
+	andi	src2_off, src2, 0x7
+	slt	tmp2, src1_off, src2_off
+	CONDITIONSEL(tmp2,src2_off,tmp2,tmp1)
+	maskeqz	exchange, tmp2, src1_off
+	xor	tmp3, src1, src2
+	maskeqz	tmp3, tmp3, exchange
+	xor	src1, src1, tmp3
+	xor	src2, src2, tmp3
+
+	andi	src1_off, src1, 0x7
+	beqz	src1_off, strcmp_loop_misaligned
+
+strcmp_do_misaligned:
+	ld.bu	data1, src1, 0
+	ld.bu	data2, src2, 0
+	xor	tmp3, data1, data2
+	addi.d	src1, src1, 1
+	masknez	tmp3, data1, tmp3
+	addi.d	src2, src2, 1
+	beqz	tmp3, strcmp_done
+	andi	src1_off, src1, 0x7
+	bnez	src1_off, strcmp_do_misaligned
+
+strcmp_loop_misaligned:
+	andi	tmp1, src2, 0xff8
+	xori	tmp1, tmp1, 0xff8
+	beqz	tmp1, strcmp_do_misaligned
+	ld.d	data1, src1, 0
+	ld.d	data2, src2, 0
+	addi.d	src1, src1, 8
+	addi.d	src2, src2, 8
+
+	sub.d	tmp1, data1, zeroones
+	or	tmp2, data1, sevenf
+	xor	diff, data1, data2
+	andn	has_nul, tmp1, tmp2
+	or	syndrome, diff, has_nul
+	beqz	syndrome, strcmp_loop_misaligned
+
+strcmp_misalign_end:
+	ctz.d	pos, syndrome
+	bstrins.d	pos, zero, 2, 0
+	srl.d	data1, data1, pos
+	srl.d	data2, data2, pos
+	andi	data1, data1, 0xff
+	andi	data2, data2, 0xff
+	sub.d	tmp1, data1, data2
+	sub.d	tmp2, data2, data1
+	CONDITIONSEL(result,exchange,tmp2,tmp1)
+	jr	ra
+
+strcmp_done:
+	sub.d	tmp1, data1, data2
+	sub.d	tmp2, data2, data1
+	CONDITIONSEL(result,exchange,tmp2,tmp1)
+	jr	ra
+END(STRCMP)
+
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+libc_hidden_builtin_def (strcmp)
+#endif
+#endif
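On the aligned path, each 8-byte step reduces to one syndrome word. A C model
of strcmp_loop_aligned / strcmp_end (illustrative only; it assumes both
pointers are 8-byte aligned, as that path guarantees, and the function name
is ours):

    #include <stdint.h>
    #include <string.h>

    static int aligned_strcmp_model (const char *s1, const char *s2)
    {
      for (;;)
        {
          uint64_t w1, w2;
          memcpy (&w1, s1, 8);   /* stands in for ld.d data1, src1, 0 */
          memcpy (&w2, s2, 8);
          uint64_t has_nul = (w1 - 0x0101010101010101ULL)
                             & ~(w1 | 0x7f7f7f7f7f7f7f7fULL);
          uint64_t syndrome = (w1 ^ w2) | has_nul;
          if (syndrome != 0)
            {
              /* LoongArch is little-endian: the lowest set bit marks
                 the first differing or terminating byte in memory.  */
              unsigned pos = __builtin_ctzll (syndrome) & ~7u;
              return (int) ((w1 >> pos) & 0xff)
                     - (int) ((w2 >> pos) & 0xff);
            }
          s1 += 8;
          s2 += 8;
        }
    }

The `bstrins.d pos, zero, 2, 0` in strcmp_end plays the role of the `& ~7u`
here: it rounds the bit index down to a byte boundary before the shifts.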
diff --git a/sysdeps/loongarch/lp64/strncmp.S b/sysdeps/loongarch/lp64/strncmp.S
new file mode 100644
index 0000000000..a72b280170
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strncmp.S
@@ -0,0 +1,281 @@
+/* Assembly implementation of strncmp.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ISA: LoongArch64
+ * Data Model: lp64
+ */
+
+/* Basic algorithm:
+
+   +. let t0, t1 point to a0, a1; if a0 has the smaller low 3 bits of
+      the two, set a4 to 1 and let t0 point to whichever of a0 and a1
+      has the larger low 3 bits
+
+   +. if the low 3 bits of a0 and a1 are equal, do one unaligned load
+      and aligned ld loads afterwards;
+
+   +. if not, load partial words into t2 and t3 and check whether t2
+      contains a \0;
+
+   +. then use ld for t0 and an unaligned load for t1,
+
+   +. if the partial 8 bytes from t1 contain a \0, compare them with
+      the 8 bytes from t0 under a mask kept in a7
+
+   +. if not, load the other part of t1 and compare its 8 bytes with
+      the 8 bytes from t0
+
+   +. if ((v0 - 0x0101010101010101) & ~v0 & 0x8080808080808080) != 0,
+      some byte of v0 is \0; otherwise v0 contains no \0
+
+   +. for the partial 8 bytes loaded from 0(a0) into t3, preload t3
+      with 0xffffffffffffffff  */
+
+#include <sys/asm.h>
+#include <sys/regdef.h>
+
+#define STRNCMP	strncmp
+
+#define REP8_01	0x0101010101010101
+#define REP8_7f	0x7f7f7f7f7f7f7f7f
+#define REP8_80	0x8080808080808080
+
+/* Parameters and Results */
+#define src1	a0
+#define src2	a1
+#define limit	a2
+/* Note: v0 = a0 in lp64 ABI */
+#define result	v0
+
+/* Internal variable */
+#define data1		t0
+#define data2		t1
+#define has_nul		t2
+#define diff		t3
+#define syndrome	t4
+#define zeroones	t5
+#define sevenf		t6
+#define pos		t7
+#define exchange	t8
+#define tmp1		a5
+#define tmp2		a6
+#define tmp3		a7
+#define src1_off	a3
+#define limit_wd	a4
+
+/* int strncmp (const char *s1, const char *s2, size_t n); */
+
+LEAF(STRNCMP)
+	.align	4
+	beqz	limit, strncmp_ret0
+
+	xor	tmp1, src1, src2
+	lu12i.w	zeroones, 0x01010
+	lu12i.w	sevenf, 0x7f7f7
+	andi	src1_off, src1, 0x7
+	ori	zeroones, zeroones, 0x101
+	andi	tmp1, tmp1, 0x7
+	ori	sevenf, sevenf, 0xf7f
+	bstrins.d	zeroones, zeroones, 63, 32
+	bstrins.d	sevenf, sevenf, 63, 32
+	bnez	tmp1, strncmp_misaligned8
+	bnez	src1_off, strncmp_mutual_align
+
+	addi.d	limit_wd, limit, -1
+	srli.d	limit_wd, limit_wd, 3
+
+strncmp_loop_aligned:
+	ld.d	data1, src1, 0
+	addi.d	src1, src1, 8
+	ld.d	data2, src2, 0
+	addi.d	src2, src2, 8
+
+strncmp_start_realigned:
+	addi.d	limit_wd, limit_wd, -1
+	sub.d	tmp1, data1, zeroones
+	or	tmp2, data1, sevenf
+	xor	diff, data1, data2
+	andn	has_nul, tmp1, tmp2
+	srli.d	tmp1, limit_wd, 63
+	or	syndrome, diff, has_nul
+	or	tmp2, syndrome, tmp1
+	beqz	tmp2, strncmp_loop_aligned
+
+	/* if the limit is not reached */
+	bge	limit_wd, zero, strncmp_not_limit
+	/* if the limit is reached */
+	andi	limit, limit, 0x7
+	li.w	tmp1, 0x8
+	sub.d	limit, tmp1, limit
+	slli.d	limit, limit, 0x3
+	li.d	tmp1, -1
+	srl.d	tmp1, tmp1, limit
+	and	data1, data1, tmp1
+	and	data2, data2, tmp1
+	orn	syndrome, syndrome, tmp1
+
+strncmp_not_limit:
+	ctz.d	pos, syndrome
+	bstrins.d	pos, zero, 2, 0
+	srl.d	data1, data1, pos
+	srl.d	data2, data2, pos
+	andi	data1, data1, 0xff
+	andi	data2, data2, 0xff
+	sub.d	result, data1, data2
+	jr	ra
+
+strncmp_mutual_align:
+	bstrins.d	src1, zero, 2, 0
+	bstrins.d	src2, zero, 2, 0
+	slli.d	tmp1, src1_off, 0x3
+	ld.d	data1, src1, 0
+	ld.d	data2, src2, 0
+	addi.d	src2, src2, 8
+	addi.d	src1, src1, 8
+
+	addi.d	limit_wd, limit, -1
+	andi	tmp3, limit_wd, 0x7
+	srli.d	limit_wd, limit_wd, 3
+	add.d	limit, limit, src1_off
+	add.d	tmp3, tmp3, src1_off
+	srli.d	tmp3, tmp3, 3
+	add.d	limit_wd, limit_wd, tmp3
+
+	sub.d	tmp1, zero, tmp1
+	nor	tmp2, zero, zero
+	srl.d	tmp2, tmp2, tmp1
+	or	data1, data1, tmp2
+	or	data2, data2, tmp2
+	b	strncmp_start_realigned
+
+strncmp_misaligned8:
+	li.w	tmp1, 0x10
+	bge	limit, tmp1, strncmp_try_words
+
+strncmp_byte_loop:
+	ld.bu	data1, src1, 0
+	ld.bu	data2, src2, 0
+	addi.d	limit, limit, -1
+	xor	tmp1, data1, data2
+	masknez	tmp1, data1, tmp1
+	maskeqz	tmp1, limit, tmp1
+	beqz	tmp1, strncmp_done
+
+	ld.bu	data1, src1, 1
+	ld.bu	data2, src2, 1
+	addi.d	src1, src1, 2
+	addi.d	src2, src2, 2
+	addi.d	limit, limit, -1
+	xor	tmp1, data1, data2
+	masknez	tmp1, data1, tmp1
+	maskeqz	tmp1, limit, tmp1
+	bnez	tmp1, strncmp_byte_loop
+
+strncmp_done:
+	sub.d	result, data1, data2
+	jr	ra
+
+strncmp_try_words:
+	srli.d	limit_wd, limit, 3
+	beqz	src1_off, strncmp_do_misaligned
+
+	sub.d	src1_off, zero, src1_off
+	andi	src1_off, src1_off, 0x7
+	sub.d	limit, limit, src1_off
+	srli.d	limit_wd, limit, 0x3
+
+strncmp_page_end_loop:
+	ld.bu	data1, src1, 0
+	ld.bu	data2, src2, 0
+	addi.d	src1, src1, 1
+	addi.d	src2, src2, 1
+	xor	tmp1, data1, data2
+	masknez	tmp1, data1, tmp1
+	beqz	tmp1, strncmp_done
+	andi	tmp1, src1, 0x7
+	bnez	tmp1, strncmp_page_end_loop
+
+strncmp_do_misaligned:
+	li.w	src1_off, 0x8
+	addi.d	limit_wd, limit_wd, -1
+	blt	limit_wd, zero, strncmp_done_loop
+
+strncmp_loop_misaligned:
+	andi	tmp2, src2, 0xff8
+	xori	tmp2, tmp2, 0xff8
+	beqz	tmp2, strncmp_page_end_loop
+
+	ld.d	data1, src1, 0
+	ld.d	data2, src2, 0
+	addi.d	src1, src1, 8
+	addi.d	src2, src2, 8
+	sub.d	tmp1, data1, zeroones
+	or	tmp2, data1, sevenf
+	xor	diff, data1, data2
+	andn	has_nul, tmp1, tmp2
+	or	syndrome, diff, has_nul
+	bnez	syndrome, strncmp_not_limit
+	addi.d	limit_wd, limit_wd, -1
+	bge	limit_wd, zero, strncmp_loop_misaligned
+
+strncmp_done_loop:
+	andi	limit, limit, 0x7
+	beqz	limit, strncmp_not_limit
+
+	/* Read the last double word.  */
+	/* Check whether the final part is about to cross a page.  */
+	andi	tmp1, src2, 0x7
+	andi	tmp2, src2, 0xff8
+	add.d	tmp1, tmp1, limit
+	xori	tmp2, tmp2, 0xff8
+	andi	tmp1, tmp1, 0x8
+	masknez	tmp1, tmp1, tmp2
+	bnez	tmp1, strncmp_byte_loop
+	addi.d	src1, src1, -8
+	addi.d	src2, src2, -8
+	ldx.d	data1, src1, limit
+	ldx.d	data2, src2, limit
+	sub.d	tmp1, data1, zeroones
+	or	tmp2, data1, sevenf
+	xor	diff, data1, data2
+	andn	has_nul, tmp1, tmp2
+	or	syndrome, diff, has_nul
+	bnez	syndrome, strncmp_not_limit
+
+strncmp_ret0:
+	move	result, zero
+	jr	ra
+
+/* check if ((src1 != 0) && ((src2 == 0) || (src1 < src2)))
+   then exchange(src1,src2)  */
+
+END(STRNCMP)
+
+#ifndef ANDROID_CHANGES
+#ifdef _LIBC
+libc_hidden_builtin_def (strncmp)
+#endif
+#endif
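One subtle piece above is the "limit is reached" block after
strncmp_start_realigned: when n runs out mid-word, the last word is clamped
so bytes beyond n can neither match nor mismatch. A C model (illustrative
only; rem is assumed to be 1..7, and the helper name is ours):

    #include <stdint.h>

    /* Mirrors: li.d tmp1, -1; srl.d tmp1, tmp1, (8 - n%8) * 8;
       and data1/data2 with the mask; orn syndrome, syndrome, tmp1.  */
    static uint64_t clamp_last_word (uint64_t *w1, uint64_t *w2,
                                     uint64_t syndrome, unsigned rem)
    {
      uint64_t mask = ~0ULL >> ((8 - rem) * 8);  /* keep low rem bytes */
      *w1 &= mask;
      *w2 &= mask;
      return syndrome | ~mask;  /* force a stop at the limit byte */
    }

Forcing ~mask into the syndrome guarantees that ctz.d finds a "terminator"
no later than byte rem, so the final byte subtraction in strncmp_not_limit
never looks past the limit.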