[3/3] Loongarch: Add ifunc support for strncmp{aligned, lsx}

Message ID 20230822021118.3489949-4-dengjianbo@loongson.cn
State New
Headers
Series Add ifunc support for str{nlen, cmp, ncmp} |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
redhat-pt-bot/TryBot-32bit success Build for i686
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Testing passed
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Testing passed
redhat-pt-bot/TryBot-still_applies warning Patch no longer applies to master

Commit Message

dengjianbo Aug. 22, 2023, 2:11 a.m. UTC
  Based on the glibc microbenchmark, only a few short inputs with this
strncmp-aligned and strncmp-lsx implementation experience performance
degradation, overall, strncmp-aligned could reduce the runtime 0%-10%
for aligned comparision, 10%-25% for unaligend comparision, strncmp-lsx
could reduce the runtime about 0%-60%.
---
 sysdeps/loongarch/lp64/multiarch/Makefile     |   2 +
 .../lp64/multiarch/ifunc-impl-list.c          |   7 +
 .../loongarch/lp64/multiarch/ifunc-strncmp.h  |  38 +++
 .../lp64/multiarch/strncmp-aligned.S          | 218 ++++++++++++++++++
 .../loongarch/lp64/multiarch/strncmp-lsx.S    | 206 +++++++++++++++++
 sysdeps/loongarch/lp64/multiarch/strncmp.c    |  35 +++
 6 files changed, 506 insertions(+)
 create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strncmp.h
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strncmp.c
  

Comments

Richard Henderson Aug. 22, 2023, 3:56 a.m. UTC | #1
On 8/21/23 19:11, dengjianbo wrote:
> +L(magic_num):
> +    .align          6
> +    .dword          0x0706050403020100
> +    .dword          0x0f0e0d0c0b0a0908
> +ENTRY_NO_ALIGN(STRNCMP)
> +    beqz            a2, L(ret0)
> +    pcaddi          t0, -5
> +    andi            a3, a0, 0xf
> +    vld             vr2, t0, 0

Why is the data not in .rodata or a mergable constant section?

You can use pcalau12i and %pc_lo12 with vld to place this data anywhere.


r~
  
dengjianbo Aug. 22, 2023, 6:37 a.m. UTC | #2
On 2023-08-22 11:56, Richard Henderson wrote:
>> +L(magic_num):
>> +    .align          6
>> +    .dword          0x0706050403020100
>> +    .dword          0x0f0e0d0c0b0a0908
>> +ENTRY_NO_ALIGN(STRNCMP)
>> +    beqz            a2, L(ret0)
>> +    pcaddi          t0, -5
>> +    andi            a3, a0, 0xf
>> +    vld             vr2, t0, 0
>
> Why is the data not in .rodata or a mergable constant section?
>
> You can use pcalau12i and %pc_lo12 with vld to place this data anywhere.
>
>
> r~ 

Putting the data here is due to the performance. When the vld
instruction is executed, the data will be in the cache, it can
speed up the data loading.
  
Xi Ruoyao Aug. 22, 2023, 11:13 a.m. UTC | #3
On Tue, 2023-08-22 at 14:37 +0800, dengjianbo wrote:

> Putting the data here is due to the performance. When the vld
>  instruction is executed, the data will be in the cache, it can
>  speed up the data loading. 

AFAIK LoongArch CPUs have separate icache and dcache like all modern
CPUs, so this is not valid to me.
  
Xi Ruoyao Aug. 22, 2023, 11:23 a.m. UTC | #4
On Tue, 2023-08-22 at 19:13 +0800, Xi Ruoyao via Libc-alpha wrote:
> On Tue, 2023-08-22 at 14:37 +0800, dengjianbo wrote:
> 
> > Putting the data here is due to the performance. When the vld
> >  instruction is executed, the data will be in the cache, it can
> >  speed up the data loading. 
> 
> AFAIK LoongArch CPUs have separate icache and dcache like all modern
> CPUs, so this is not valid to me.

And even if it can really improve the performance, this is not on the
hot path of the algorithm so we should not use bizarre optimizations
here for marginal improvement.
  
dengjianbo Aug. 23, 2023, 7:25 a.m. UTC | #5
On 2023-08-22 19:23, Xi Ruoyao wrote:
> On Tue, 2023-08-22 at 19:13 +0800, Xi Ruoyao via Libc-alpha wrote:
>> On Tue, 2023-08-22 at 14:37 +0800, dengjianbo wrote:
>>
>>> Putting the data here is due to the performance. When the vld
>>>  instruction is executed, the data will be in the cache, it can
>>>  speed up the data loading. 
>> AFAIK LoongArch CPUs have separate icache and dcache like all modern
>> CPUs, so this is not valid to me.
> And even if it can really improve the performance, this is not on the
> hot path of the algorithm so we should not use bizarre optimizations
> here for marginal improvement.
>
> -- Xi Ruoyao <xry111@xry111.site> School of Aerospace Science and Technology, Xidian University
Thanks for your suggestion. We have changed strcmp and strncmp to put
the data in the rodata section with mergeable flags, and also use pcalau12i
and %pc_lo12 with the vld to get the data.

diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S

index 595472fcda..0b4eee2a98 100644
--- a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
@@ -25,15 +25,11 @@
 
 # define STRNCMP __strncmp_lsx
 
-L(magic_num):
-    .align          6
-    .dword          0x0706050403020100
-    .dword          0x0f0e0d0c0b0a0908
-ENTRY_NO_ALIGN(STRNCMP)
+LEAF(STRNCMP, 6)
     beqz            a2, L(ret0)
-    pcaddi          t0, -5
+    pcalau12i       t0, %pc_hi20(L(INDEX))
     andi            a3, a0, 0xf
-    vld             vr2, t0, 0
+    vld             vr2, t0, %pc_lo12(L(INDEX))
 
     andi            a4, a1, 0xf
     li.d            t2, 16
@@ -202,5 +198,11 @@ L(ret0):
     jr              ra
 END(STRNCMP)
 
+    .section         .rodata.cst16,"M",@progbits,16
+    .align           4
+L(INDEX):
+    .dword           0x0706050403020100
+    .dword           0x0f0e0d0c0b0a0908
+
 libc_hidden_builtin_def (STRNCMP)
 #endif
  

Patch

diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
index d5a500decd..5d7ae7ae73 100644
--- a/sysdeps/loongarch/lp64/multiarch/Makefile
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -14,6 +14,8 @@  sysdep_routines += \
   strchrnul-lasx \
   strcmp-aligned \
   strcmp-lsx \
+  strncmp-aligned \
+  strncmp-lsx \
   memcpy-aligned \
   memcpy-unaligned \
   memmove-unaligned \
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
index 9183b7da24..c8ba87bd81 100644
--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -69,6 +69,13 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_aligned)
 	      )
 
+  IFUNC_IMPL (i, name, strncmp,
+#if !defined __loongarch_soft_float
+	      IFUNC_IMPL_ADD (array, i, strncmp, SUPPORT_LSX, __strncmp_lsx)
+#endif
+	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_aligned)
+	      )
+
   IFUNC_IMPL (i, name, memcpy,
 #if !defined __loongarch_soft_float
               IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LASX, __memcpy_lasx)
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strncmp.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strncmp.h
new file mode 100644
index 0000000000..1a7dc36ba6
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strncmp.h
@@ -0,0 +1,38 @@ 
+/* Common definition for strncmp ifunc selection.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <ldsodefs.h>
+#include <ifunc-init.h>
+
+#if !defined __loongarch_soft_float
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
+#endif
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+#if !defined __loongarch_soft_float
+  if (SUPPORT_LSX)
+    return OPTIMIZE (lsx);
+  else
+#endif
+    return OPTIMIZE (aligned);
+}
diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S
new file mode 100644
index 0000000000..e2687fa770
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-aligned.S
@@ -0,0 +1,218 @@ 
+/* Optimized strncmp implementation using basic Loongarch instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# define STRNCMP __strncmp_aligned
+#else
+# define STRNCMP strncmp
+#endif
+
+LEAF(STRNCMP, 6)
+    beqz        a2, L(ret0)
+    lu12i.w     a5, 0x01010
+    andi        a3, a0, 0x7
+    ori         a5, a5, 0x101
+
+    andi        a4, a1, 0x7
+    bstrins.d   a5, a5, 63, 32
+    li.d        t7, -1
+    li.d        t8, 8
+
+    addi.d      a2, a2, -1
+    slli.d      a6, a5, 7
+    bne         a3, a4, L(unaligned)
+    bstrins.d   a0, zero, 2, 0
+
+    bstrins.d   a1, zero, 2, 0
+    ld.d        t0, a0, 0
+    ld.d        t1, a1, 0
+    slli.d      t2, a3, 3
+
+
+    sub.d       t5, t8, a3
+    srl.d       t3, t7, t2
+    srl.d       t0, t0, t2
+    srl.d       t1, t1, t2
+
+    orn         t0, t0, t3
+    orn         t1, t1, t3
+    sub.d       t2, t0, a5
+    andn        t3, a6, t0
+
+    and         t2, t2, t3
+    bne         t0, t1, L(al_end)
+    sltu        t4, a2, t5
+    sub.d       a2, a2, t5
+
+L(al_loop):
+    or          t4, t2, t4
+    bnez        t4, L(ret0)
+    ldx.d       t0, a0, t8
+    ldx.d       t1, a1, t8
+
+
+    addi.d      t8, t8, 8
+    sltui       t4, a2, 8
+    addi.d      a2, a2, -8
+    sub.d       t2, t0, a5
+
+    andn        t3, a6, t0
+    and         t2, t2, t3
+    beq         t0, t1, L(al_loop)
+    addi.d      a2, a2, 8
+
+L(al_end):
+    xor         t3, t0, t1
+    or          t2, t2, t3
+    ctz.d       t2, t2
+    srli.d      t4, t2, 3
+
+    bstrins.d   t2, zero, 2, 0
+    srl.d       t0, t0, t2
+    srl.d       t1, t1, t2
+    andi        t0, t0, 0xff
+
+
+    andi        t1, t1, 0xff
+    sltu        t2, a2, t4
+    sub.d       a0, t0, t1
+    masknez     a0, a0, t2
+
+    jr          ra
+L(ret0):
+    move        a0, zero
+    jr          ra
+    nop
+
+L(unaligned):
+    slt         a7, a4, a3
+    xor         t0, a0, a1
+    maskeqz     t0, t0, a7
+    xor         a0, a0, t0
+
+    xor         a1, a1, t0
+    andi        a3, a0, 0x7
+    andi        a4, a1, 0x7
+    bstrins.d   a0, zero, 2, 0
+
+
+    bstrins.d   a1, zero, 2, 0
+    ld.d        t4, a0, 0
+    ld.d        t1, a1, 0
+    slli.d      t2, a3, 3
+
+    slli.d      t3, a4, 3
+    srl.d       t5, t7, t3
+    srl.d       t0, t4, t2
+    srl.d       t1, t1, t3
+
+    orn         t0, t0, t5
+    orn         t1, t1, t5
+    bne         t0, t1, L(not_equal)
+    sub.d       t6, t8, a4
+
+    sub.d       a4, t2, t3
+    sll.d       t2, t7, t2
+    sub.d       t5, t8, a3
+    orn         t4, t4, t2
+
+
+    sub.d       t2, t4, a5
+    andn        t3, a6, t4
+    sltu        t7, a2, t5
+    and         t2, t2, t3
+
+    sub.d       a3, zero, a4
+    or          t2, t2, t7
+    bnez        t2, L(un_end)
+    sub.d       t7, t5, t6
+
+    sub.d       a2, a2, t5
+    sub.d       t6, t8, t7
+L(un_loop):
+    srl.d       t5, t4, a4
+    ldx.d       t4, a0, t8
+
+    ldx.d       t1, a1, t8
+    addi.d      t8, t8, 8
+    sll.d       t0, t4, a3
+    or          t0, t0, t5
+
+
+    bne         t0, t1, L(loop_not_equal)
+    sub.d       t2, t4, a5
+    andn        t3, a6, t4
+    sltui       t5, a2, 8
+
+    and         t2, t2, t3
+    addi.d      a2, a2, -8
+    or          t3, t2, t5
+    beqz        t3, L(un_loop)
+
+    addi.d      a2, a2, 8
+L(un_end):
+    sub.d       t2, t0, a5
+    andn        t3, a6, t0
+    sltu        t5, a2, t6
+
+    and         t2, t2, t3
+    or          t2, t2, t5
+    bnez        t2, L(ret0)
+    ldx.d       t1, a1, t8
+
+
+    srl.d       t0, t4, a4
+    sub.d       a2, a2, t6
+L(not_equal):
+    sub.d       t2, t0, a5
+    andn        t3, a6, t0
+
+    xor         t4, t0, t1
+    and         t2, t2, t3
+    or          t2, t2, t4
+    ctz.d       t2, t2
+
+    bstrins.d   t2, zero, 2, 0
+    srli.d      t4, t2, 3
+    srl.d       t0, t0, t2
+    srl.d       t1, t1, t2
+
+    andi        t0, t0, 0xff
+    andi        t1, t1, 0xff
+    sub.d       t2, t0, t1
+    sub.d       t3, t1, t0
+
+
+    masknez     t0, t2, a7
+    maskeqz     t1, t3, a7
+    sltu        t2, a2, t4
+    or          a0, t0, t1
+
+    masknez     a0, a0, t2
+    jr          ra
+L(loop_not_equal):
+    add.d       a2, a2, t7
+    b           L(not_equal)
+END(STRNCMP)
+
+libc_hidden_builtin_def (STRNCMP)
diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
new file mode 100644
index 0000000000..595472fcda
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp-lsx.S
@@ -0,0 +1,206 @@ 
+/* Optimized strncmp implementation using Loongarch LSX instructions.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc) && !defined __loongarch_soft_float
+
+# define STRNCMP __strncmp_lsx
+
+L(magic_num):
+    .align          6
+    .dword          0x0706050403020100
+    .dword          0x0f0e0d0c0b0a0908
+ENTRY_NO_ALIGN(STRNCMP)
+    beqz            a2, L(ret0)
+    pcaddi          t0, -5
+    andi            a3, a0, 0xf
+    vld             vr2, t0, 0
+
+    andi            a4, a1, 0xf
+    li.d            t2, 16
+    bne             a3, a4, L(unaligned)
+    xor             t0, a0, a3
+
+    xor             t1, a1, a4
+    vld             vr0, t0, 0
+    vld             vr1, t1, 0
+    vreplgr2vr.b    vr3, a3
+
+
+    sub.d           t2, t2, a3
+    vadd.b          vr3, vr3, vr2
+    vshuf.b         vr0, vr3, vr0, vr3
+    vshuf.b         vr1, vr3, vr1, vr3
+
+    vseq.b          vr3, vr0, vr1
+    vmin.bu         vr3, vr0, vr3
+    bgeu            t2, a2, L(al_early_end)
+    vsetanyeqz.b    fcc0, vr3
+
+    bcnez           fcc0, L(al_end)
+    add.d           a3, a0, a2
+    addi.d          a4, a3, -1
+    bstrins.d       a4, zero, 3, 0
+
+    sub.d           a2, a3, a4
+L(al_loop):
+    vld             vr0, t0, 16
+    vld             vr1, t1, 16
+    addi.d          t0, t0, 16
+
+
+    addi.d          t1, t1, 16
+    vseq.b          vr3, vr0, vr1
+    vmin.bu         vr3, vr0, vr3
+    beq             t0, a4, L(al_early_end)
+
+    vsetanyeqz.b    fcc0, vr3
+    bceqz           fcc0, L(al_loop)
+L(al_end):
+    vseqi.b         vr3, vr3, 0
+    vfrstpi.b       vr3, vr3, 0
+
+    vshuf.b         vr0, vr0, vr0, vr3
+    vshuf.b         vr1, vr1, vr1, vr3
+    vpickve2gr.bu   t0, vr0, 0
+    vpickve2gr.bu   t1, vr1, 0
+
+    sub.d           a0, t0, t1
+    jr              ra
+L(al_early_end):
+    vreplgr2vr.b    vr4, a2
+    vslt.b          vr4, vr2, vr4
+
+
+    vorn.v          vr3, vr3, vr4
+    b               L(al_end)
+L(unaligned):
+    slt             a5, a3, a4
+    xor             t0, a0, a1
+
+    maskeqz         t0, t0, a5
+    xor             a0, a0, t0
+    xor             a1, a1, t0
+    andi            a3, a0, 0xf
+
+    andi            a4, a1, 0xf
+    xor             t0, a0, a3
+    xor             t1, a1, a4
+    vld             vr0, t0, 0
+
+    vld             vr3, t1, 0
+    sub.d           t2, t2, a3
+    vreplgr2vr.b    vr4, a3
+    vreplgr2vr.b    vr5, a4
+
+
+    vaddi.bu        vr6, vr2, 16
+    vsub.b          vr7, vr4, vr5
+    vsub.b          vr6, vr6, vr7
+    vadd.b          vr4, vr2, vr4
+
+    vshuf.b         vr1, vr3, vr3, vr6
+    vshuf.b         vr0, vr7, vr0, vr4
+    vshuf.b         vr1, vr7, vr1, vr4
+    vseq.b          vr4, vr0, vr1
+
+    vmin.bu         vr4, vr0, vr4
+    bgeu            t2, a2, L(un_early_end)
+    vsetanyeqz.b    fcc0, vr4
+    bcnez           fcc0, L(un_end)
+
+    add.d           a6, a0, a2
+    vslt.b          vr5, vr2, vr5
+    addi.d          a7, a6, -1
+    vor.v           vr3, vr3, vr5
+
+
+    bstrins.d       a7, zero, 3, 0
+    sub.d           a2, a6, a7
+L(un_loop):
+    vld             vr0, t0, 16
+    addi.d          t0, t0, 16
+
+    vsetanyeqz.b    fcc0, vr3
+    bcnez           fcc0, L(has_zero)
+    beq             t0, a7, L(end_with_len)
+    vor.v           vr1, vr3, vr3
+
+    vld             vr3, t1, 16
+    addi.d          t1, t1, 16
+    vshuf.b         vr1, vr3, vr1, vr6
+    vseq.b          vr4, vr0, vr1
+
+    vmin.bu         vr4, vr0, vr4
+    vsetanyeqz.b    fcc0, vr4
+    bceqz           fcc0, L(un_loop)
+L(un_end):
+    vseqi.b         vr4, vr4, 0
+
+
+    vfrstpi.b       vr4, vr4, 0
+    vshuf.b         vr0, vr0, vr0, vr4
+    vshuf.b         vr1, vr1, vr1, vr4
+    vpickve2gr.bu   t0, vr0, 0
+
+    vpickve2gr.bu   t1, vr1, 0
+    sub.d           t2, t0, t1
+    sub.d           t3, t1, t0
+    masknez         t0, t2, a5
+
+    maskeqz         t1, t3, a5
+    or              a0, t0, t1
+    jr              ra
+L(has_zero):
+    vshuf.b         vr1, vr3, vr3, vr6
+
+    vseq.b          vr4, vr0, vr1
+    vmin.bu         vr4, vr0, vr4
+    bne             t0, a7, L(un_end)
+L(un_early_end):
+    vreplgr2vr.b    vr5, a2
+
+    vslt.b          vr5, vr2, vr5
+    vorn.v          vr4, vr4, vr5
+    b               L(un_end)
+L(end_with_len):
+    sub.d           a6, a3, a4
+
+    bgeu            a6, a2, 1f
+    vld             vr4, t1, 16
+1:
+    vshuf.b         vr1, vr4, vr3, vr6
+    vseq.b          vr4, vr0, vr1
+
+    vmin.bu         vr4, vr0, vr4
+    vreplgr2vr.b    vr5, a2
+    vslt.b          vr5, vr2, vr5
+    vorn.v          vr4, vr4, vr5
+
+    b               L(un_end)
+L(ret0):
+    move            a0, zero
+    jr              ra
+END(STRNCMP)
+
+libc_hidden_builtin_def (STRNCMP)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strncmp.c b/sysdeps/loongarch/lp64/multiarch/strncmp.c
new file mode 100644
index 0000000000..af6d0bc4a7
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strncmp.c
@@ -0,0 +1,35 @@ 
+/* Multiple versions of strncmp.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define strncmp __redirect_strncmp
+# include <string.h>
+# undef strncmp
+
+# define SYMBOL_NAME strncmp
+# include "ifunc-strncmp.h"
+
+libc_ifunc_redirected (__redirect_strncmp, strncmp, IFUNC_SELECTOR ());
+
+# ifdef SHARED
+__hidden_ver1 (strncmp, __GI_strncmp, __redirect_strncmp)
+  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strncmp);
+# endif
+#endif