[2/2] Loongarch: Add ifunc support and add different versions of strlen

Message ID 20230801070902.1385953-3-dengjianbo@loongson.cn
State Superseded
Headers
Series Add ifunc support and different versions of strlen |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
redhat-pt-bot/TryBot-32bit success Build for i686
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Testing passed

Commit Message

dengjianbo Aug. 1, 2023, 7:09 a.m. UTC
1. strlen-lasx is implemented with LASX SIMD instructions (256-bit)
2. strlen-lsx is implemented with LSX SIMD instructions (128-bit)
3. strlen-aligned is implemented with LA basic instructions and never uses unaligned memory access
---
 sysdeps/loongarch/lp64/multiarch/Makefile     |   3 +
 .../lp64/multiarch/ifunc-impl-list.c          |  39 +++++++
 .../loongarch/lp64/multiarch/ifunc-strlen.h   |  36 +++++++
 .../loongarch/lp64/multiarch/strlen-aligned.S | 101 ++++++++++++++++++
 .../loongarch/lp64/multiarch/strlen-lasx.S    |  65 +++++++++++
 sysdeps/loongarch/lp64/multiarch/strlen-lsx.S |  73 +++++++++++++
 sysdeps/loongarch/lp64/multiarch/strlen.c     |  37 +++++++
 sysdeps/loongarch/sys/regdef.h                |  57 ++++++++++
 .../unix/sysv/linux/loongarch/cpu-features.h  |   2 +
 9 files changed, 413 insertions(+)
 create mode 100644 sysdeps/loongarch/lp64/multiarch/Makefile
 create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
 create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
 create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen.c
  

Comments

Adhemerval Zanella Netto Aug. 1, 2023, 2:31 p.m. UTC | #1
On 01/08/23 04:09, dengjianbo wrote:
> 1. strlen-lasx is implemeted by LASX simd instructions(256bit)
> 2. strlen-lsx is implemeted by LSX simd instructions(128bit)
> 3. strlen-align is implemented by LA basic instructions and never use unaligned memory acess

Usually, optimized routines are submitted along with benchmark numbers to show
the expected improvements over different sizes and alignments.

> ---
>  sysdeps/loongarch/lp64/multiarch/Makefile     |   3 +
>  .../lp64/multiarch/ifunc-impl-list.c          |  39 +++++++
>  .../loongarch/lp64/multiarch/ifunc-strlen.h   |  36 +++++++
>  .../loongarch/lp64/multiarch/strlen-aligned.S | 101 ++++++++++++++++++
>  .../loongarch/lp64/multiarch/strlen-lasx.S    |  65 +++++++++++
>  sysdeps/loongarch/lp64/multiarch/strlen-lsx.S |  73 +++++++++++++
>  sysdeps/loongarch/lp64/multiarch/strlen.c     |  37 +++++++
>  sysdeps/loongarch/sys/regdef.h                |  57 ++++++++++
>  .../unix/sysv/linux/loongarch/cpu-features.h  |   2 +
>  9 files changed, 413 insertions(+)
>  create mode 100644 sysdeps/loongarch/lp64/multiarch/Makefile
>  create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
>  create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h
>  create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
>  create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
>  create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
>  create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen.c
> 
> diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
> new file mode 100644
> index 0000000000..529a8b6bab
> --- /dev/null
> +++ b/sysdeps/loongarch/lp64/multiarch/Makefile
> @@ -0,0 +1,3 @@
> +ifeq ($(subdir),string)
> +sysdep_routines += strlen-aligned strlen-lsx strlen-lasx
> +endif

One entry per line:

sysdep_routines += \
  strlen-aligned \
  strlen-lsx \
  strlen-lasx \
  # sysdep_routines

> diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
> new file mode 100644
> index 0000000000..b35e41127e
> --- /dev/null
> +++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
> @@ -0,0 +1,39 @@
> +/* Enumerate available IFUNC implementations of a function.  LoongArch64 version.
> +   Copyright (C) 2017-2023 Free Software Foundation, Inc.

I think it should be only 2023 here, and for the other new files as well.

> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <assert.h>
> +#include <string.h>
> +#include <wchar.h>
> +#include <ldsodefs.h>
> +#include <ifunc-impl-list.h>
> +#include <stdio.h>
> +
> +size_t
> +__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> +			size_t max)
> +{
> +
> +  size_t i = max;
> +
> +  IFUNC_IMPL (i, name, strlen,
> +	      IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LASX, __strlen_lasx)
> +	      IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LSX, __strlen_lsx)
> +	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned)
> +	      )
> +  return i;
> +}
> diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h
> new file mode 100644
> index 0000000000..e2b3490f39
> --- /dev/null
> +++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h
> @@ -0,0 +1,36 @@
> +/* Common definition for strlen implementation.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2017-2023 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <ldsodefs.h>
> +#include <ifunc-init.h>
> +
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
> +
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> +  if (SUPPORT_LASX)
> +    return OPTIMIZE (lasx);
> +  else if (SUPPORT_LSX)
> +    return OPTIMIZE (lsx);
> +  else
> +    return OPTIMIZE (aligned);
> +}
> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
> new file mode 100644
> index 0000000000..b379e978a7
> --- /dev/null
> +++ b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
> @@ -0,0 +1,101 @@
> +/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include <sys/regdef.h>
> +#include <sys/asm.h>
> +
> +#if IS_IN (libc)
> +# define STRLEN __strlen_aligned
> +#else
> +# define STRLEN strlen
> +#endif

Is this really an improvement over the generic implementation? It seems to 
use a quite similar strategy.

> +
> +LEAF(STRLEN, 6)
> +    move        a1, a0
> +    bstrins.d   a0, zero, 2, 0
> +    lu12i.w     a2, 0x01010
> +    li.w        t0, -1
> +
> +    ld.d        t2, a0, 0
> +    andi        t1, a1, 0x7
> +    ori         a2, a2, 0x101
> +    slli.d      t1, t1, 3
> +
> +    bstrins.d   a2, a2, 63, 32
> +    sll.d       t1, t0, t1
> +    slli.d      t3, a2, 7
> +    nor         a3, zero, t3
> +
> +    orn         t2, t2, t1
> +    sub.d       t0, t2, a2
> +    nor         t1, t2, a3
> +    and         t0, t0, t1
> +
> +
> +    bnez        t0, L(count_pos)
> +    addi.d      a0, a0, 8
> +L(loop_16_7bit):
> +    ld.d        t2, a0, 0
> +    sub.d       t1, t2, a2
> +
> +    and         t0, t1, t3
> +    bnez        t0, L(more_check)
> +    ld.d        t2, a0, 8
> +    sub.d       t1, t2, a2
> +
> +    and         t0, t1, t3
> +    addi.d      a0, a0, 16
> +    beqz        t0, L(loop_16_7bit)
> +    addi.d      a0, a0, -8
> +
> +L(more_check):
> +    nor         t0, t2, a3
> +    and         t0, t1, t0
> +    bnez        t0, L(count_pos)
> +    addi.d      a0, a0, 8
> +
> +
> +L(loop_16_8bit):
> +    ld.d        t2, a0, 0
> +    sub.d       t1, t2, a2
> +    nor         t0, t2, a3
> +    and         t0, t0, t1
> +
> +    bnez        t0, L(count_pos)
> +    ld.d        t2, a0, 8
> +    addi.d      a0, a0, 16
> +    sub.d       t1, t2, a2
> +
> +    nor         t0, t2, a3
> +    and         t0, t0, t1
> +    beqz        t0, L(loop_16_8bit)
> +    addi.d      a0, a0, -8
> +
> +L(count_pos):
> +    ctz.d       t1, t0
> +    sub.d       a0, a0, a1
> +    srli.d      t1, t1, 3
> +    add.d       a0, a0, t1
> +
> +    jr          ra
> +END(STRLEN)
> +
> +#ifdef _LIBC
> +libc_hidden_builtin_def (STRLEN)
> +#endif
> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
> new file mode 100644
> index 0000000000..56ac6403d3
> --- /dev/null
> +++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
> @@ -0,0 +1,65 @@

Missing the one-line comment describing the file.

> +/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include <sys/regdef.h>
> +#include <sys/asm.h>
> +
> +#if IS_IN (libc)
> +
> +# define STRLEN __strlen_lasx
> +
> +LEAF(STRLEN, 6)
> +    move            a1, a0
> +    bstrins.d       a0, zero, 4, 0
> +    li.d            t1, -1
> +    xvld            xr0, a0, 0
> +
> +    xvmsknz.b       xr0, xr0
> +    xvpickve.w      xr1, xr0, 4
> +    vilvl.h         vr0, vr1, vr0
> +    movfr2gr.s      t0, fa0  # sign extend
> +
> +    sra.w           t0, t0, a1
> +    beq             t0, t1, L(loop)
> +    cto.w           a0, t0
> +    jr              ra
> +
> +L(loop):
> +    xvld            xr0, a0, 32
> +    addi.d          a0, a0, 32
> +    xvsetanyeqz.b   fcc0, xr0
> +    bceqz           fcc0, L(loop)
> +
> +
> +    xvmsknz.b       xr0, xr0
> +    sub.d           a0, a0, a1
> +    xvpickve.w      xr1, xr0, 4
> +    vilvl.h         vr0, vr1, vr0
> +
> +    movfr2gr.s      t0, fa0
> +    cto.w           t0, t0
> +    add.d           a0, a0, t0
> +    jr              ra
> +END(STRLEN)
> +
> +#ifdef _LIBC
> +libc_hidden_builtin_def (STRLEN)
> +#endif
> +
> +#endif

This implementation fails to assemble with binutils 2.40.0.20230525:

../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:31: Error: no match insn: xvld        $xr0,$r4,0
../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:33: Error: no match insn: xvmsknz.b   $xr0,$xr0
../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:34: Error: no match insn: xvpickve.w  $xr1,$xr0,4
../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:35: Error: no match insn: vilvl.h     $vr0,$vr1,$vr0
../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:44: Error: no match insn: xvld        $xr0,$r4,32
../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:46: Error: no match insn: xvsetanyeqz.b       $fcc0,$xr0
../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:50: Error: no match insn: xvmsknz.b   $xr0,$xr0
../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:52: Error: no match insn: xvpickve.w  $xr1,$xr0,4
../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:53: Error: no match insn: vilvl.h     $vr0,$vr1,$vr0

You need to either add a configure check to raise the minimum required
binutils version, or add macros to synthesize the instructions on older
binutils (similar to what sysdeps/powerpc/powerpc64/le/power9/strncmp.S does).


> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
> new file mode 100644
> index 0000000000..1c19c98b5b
> --- /dev/null
> +++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
> @@ -0,0 +1,73 @@
> +/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include <sys/regdef.h>
> +#include <sys/asm.h>
> +
> +#if IS_IN (libc)
> +
> +# define STRLEN __strlen_lsx
> +
> +LEAF(STRLEN, 6)
> +    move            a1, a0
> +    bstrins.d       a0, zero, 4, 0
> +    vld             vr0, a0, 0
> +    vld             vr1, a0, 16
> +
> +    li.d            t1, -1
> +    vmsknz.b        vr0, vr0
> +    vmsknz.b        vr1, vr1
> +    vilvl.h         vr0, vr1, vr0
> +
> +    movfr2gr.s      t0, fa0
> +    sra.w           t0, t0, a1
> +    beq             t0, t1, L(loop)
> +    cto.w           a0, t0
> +
> +    jr              ra
> +    nop
> +    nop
> +    nop
> +
> +
> +L(loop):
> +    vld             vr0, a0, 32
> +    vld             vr1, a0, 48
> +    addi.d          a0, a0, 32
> +    vmin.bu         vr2, vr0, vr1
> +
> +    vsetanyeqz.b    fcc0, vr2
> +    bceqz           fcc0, L(loop)
> +    vmsknz.b        vr0, vr0
> +    vmsknz.b        vr1, vr1
> +
> +    vilvl.h         vr0, vr1, vr0
> +    sub.d           a0, a0, a1
> +    movfr2gr.s      t0, fa0
> +    cto.w           t0, t0
> +
> +    add.d           a0, a0, t0
> +    jr              ra
> +END(STRLEN)
> +
> +#ifdef _LIBC
> +libc_hidden_builtin_def (STRLEN)
> +#endif
> +
> +#endif

This implementation fails to assemble with binutils 2.40.0.20230525:

../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S: Assembler messages:
../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:30: Error: no match insn: vld  $vr0,$r4,0
../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:31: Error: no match insn: vld  $vr1,$r4,16
../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:34: Error: no match insn: vmsknz.b     $vr0,$vr0
../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:35: Error: no match insn: vmsknz.b     $vr1,$vr1
../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:36: Error: no match insn: vilvl.h      $vr0,$vr1,$vr0
../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:50: Error: no match insn: vld  $vr0,$r4,32
../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:51: Error: no match insn: vld  $vr1,$r4,48
../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:53: Error: no match insn: vmin.bu      $vr2,$vr0,$vr1
../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:55: Error: no match insn: vsetanyeqz.b $fcc0,$vr2
../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:57: Error: no match insn: vmsknz.b     $vr0,$vr0
../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:58: Error: no match insn: vmsknz.b     $vr1,$vr1
../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:60: Error: no match insn: vilvl.h      $vr0,$vr1,$vr0

> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen.c b/sysdeps/loongarch/lp64/multiarch/strlen.c
> new file mode 100644
> index 0000000000..416ed0d9e2
> --- /dev/null
> +++ b/sysdeps/loongarch/lp64/multiarch/strlen.c
> @@ -0,0 +1,37 @@
> +/* Multiple versions of strlen.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2017-2023 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +
> +#if IS_IN (libc)
> +# define strlen __redirect_strlen
> +# include <string.h>
> +# undef strlen
> +
> +# define SYMBOL_NAME strlen
> +# include "ifunc-strlen.h"
> +
> +libc_ifunc_redirected (__redirect_strlen, strlen, IFUNC_SELECTOR ());
> +
> +# ifdef SHARED
> +__hidden_ver1 (strlen, __GI_strlen, __redirect_strlen)
> +  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlen);
> +# endif
> +
> +#endif
> diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h
> index 5100f36d24..524d2e3277 100644
> --- a/sysdeps/loongarch/sys/regdef.h
> +++ b/sysdeps/loongarch/sys/regdef.h
> @@ -89,6 +89,14 @@
>  #define fs5 $f29
>  #define fs6 $f30
>  #define fs7 $f31
> +#define fcc0 $fcc0
> +#define fcc1 $fcc1
> +#define fcc2 $fcc2
> +#define fcc3 $fcc3
> +#define fcc4 $fcc4
> +#define fcc5 $fcc5
> +#define fcc6 $fcc6
> +#define fcc7 $fcc7
>  
>  #define vr0 $vr0
>  #define vr1 $vr1
> @@ -98,6 +106,30 @@
>  #define vr5 $vr5
>  #define vr6 $vr6
>  #define vr7 $vr7
> +#define vr8 $vr8
> +#define vr9 $vr9
> +#define vr10 $vr10
> +#define vr11 $vr11
> +#define vr12 $vr12
> +#define vr13 $vr13
> +#define vr14 $vr14
> +#define vr15 $vr15
> +#define vr16 $vr16
> +#define vr17 $vr17
> +#define vr18 $vr18
> +#define vr19 $vr19
> +#define vr20 $vr20
> +#define vr21 $vr21
> +#define vr22 $vr22
> +#define vr23 $vr23
> +#define vr24 $vr24
> +#define vr25 $vr25
> +#define vr26 $vr26
> +#define vr27 $vr27
> +#define vr28 $vr28
> +#define vr29 $vr29
> +#define vr30 $vr30
> +#define vr31 $vr31
>  
>  #define xr0 $xr0
>  #define xr1 $xr1
> @@ -107,5 +139,30 @@
>  #define xr5 $xr5
>  #define xr6 $xr6
>  #define xr7 $xr7
> +#define xr7 $xr7
> +#define xr8 $xr8
> +#define xr9 $xr9
> +#define xr10 $xr10
> +#define xr11 $xr11
> +#define xr12 $xr12
> +#define xr13 $xr13
> +#define xr14 $xr14
> +#define xr15 $xr15
> +#define xr16 $xr16
> +#define xr17 $xr17
> +#define xr18 $xr18
> +#define xr19 $xr19
> +#define xr20 $xr20
> +#define xr21 $xr21
> +#define xr22 $xr22
> +#define xr23 $xr23
> +#define xr24 $xr24
> +#define xr25 $xr25
> +#define xr26 $xr26
> +#define xr27 $xr27
> +#define xr28 $xr28
> +#define xr29 $xr29
> +#define xr30 $xr30
> +#define xr31 $xr31
>  
>  #endif /* _SYS_REGDEF_H */
> diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
> index e371e13b15..d1a280a5ee 100644
> --- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
> +++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
> @@ -25,5 +25,7 @@
>  #define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX)
>  #define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX)
>  
> +#define INIT_ARCH()
> +
>  #endif /* _CPU_FEATURES_LOONGARCH64_H  */
>
  
Xi Ruoyao Aug. 1, 2023, 2:44 p.m. UTC | #2
On Tue, 2023-08-01 at 15:09 +0800, dengjianbo wrote:

/* snip */

> diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile
> b/sysdeps/loongarch/lp64/multiarch/Makefile
> new file mode 100644
> index 0000000000..529a8b6bab
> --- /dev/null
> +++ b/sysdeps/loongarch/lp64/multiarch/Makefile
> @@ -0,0 +1,3 @@
> +ifeq ($(subdir),string)
> +sysdep_routines += strlen-aligned strlen-lsx strlen-lasx
> +endif

Please check whether the assembler supports LSX/LASX; if it does not, you
should not add strlen-lsx and strlen-lasx here.  We don't want to disallow
building Glibc for LoongArch with an old assembler.
  
caiyinyu Aug. 2, 2023, 1:25 a.m. UTC | #3
在 2023/8/1 下午10:31, Adhemerval Zanella Netto 写道:
>
> On 01/08/23 04:09, dengjianbo wrote:
>> 1. strlen-lasx is implemeted by LASX simd instructions(256bit)
>> 2. strlen-lsx is implemeted by LSX simd instructions(128bit)
>> 3. strlen-align is implemented by LA basic instructions and never use unaligned memory acess
> Usually optimization routines are added along benchmarks number to show
> the expected improvements over different sizes and alignment.

The performance test plots for these functions over different sizes and
alignments are here:
https://github.com/jiadengx/glibc_test/blob/main/strlen/strlen_align.png
https://github.com/jiadengx/glibc_test/blob/main/strlen/strlen_lasx.png
https://github.com/jiadengx/glibc_test/blob/main/strlen/strlen_lsx.png


>
>> ---
>>   sysdeps/loongarch/lp64/multiarch/Makefile     |   3 +
>>   .../lp64/multiarch/ifunc-impl-list.c          |  39 +++++++
>>   .../loongarch/lp64/multiarch/ifunc-strlen.h   |  36 +++++++
>>   .../loongarch/lp64/multiarch/strlen-aligned.S | 101 ++++++++++++++++++
>>   .../loongarch/lp64/multiarch/strlen-lasx.S    |  65 +++++++++++
>>   sysdeps/loongarch/lp64/multiarch/strlen-lsx.S |  73 +++++++++++++
>>   sysdeps/loongarch/lp64/multiarch/strlen.c     |  37 +++++++
>>   sysdeps/loongarch/sys/regdef.h                |  57 ++++++++++
>>   .../unix/sysv/linux/loongarch/cpu-features.h  |   2 +
>>   9 files changed, 413 insertions(+)
>>   create mode 100644 sysdeps/loongarch/lp64/multiarch/Makefile
>>   create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
>>   create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h
>>   create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
>>   create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
>>   create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
>>   create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen.c
>>
>> diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
>> new file mode 100644
>> index 0000000000..529a8b6bab
>> --- /dev/null
>> +++ b/sysdeps/loongarch/lp64/multiarch/Makefile
>> @@ -0,0 +1,3 @@
>> +ifeq ($(subdir),string)
>> +sysdep_routines += strlen-aligned strlen-lsx strlen-lasx
>> +endif
> One entry per line:
>
> sysdep_routines += \
>    strlen-aligned \
>    strlen-lsx \
>    strlen-lasx \
>    # sysdep_routines
>
>> diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
>> new file mode 100644
>> index 0000000000..b35e41127e
>> --- /dev/null
>> +++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
>> @@ -0,0 +1,39 @@
>> +/* Enumerate available IFUNC implementations of a function.  LoongArch64 version.
>> +   Copyright (C) 2017-2023 Free Software Foundation, Inc.
> I think it should be only 2023 here and for other new file as well.
>
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <http://www.gnu.org/licenses/>.  */
>> +
>> +#include <assert.h>
>> +#include <string.h>
>> +#include <wchar.h>
>> +#include <ldsodefs.h>
>> +#include <ifunc-impl-list.h>
>> +#include <stdio.h>
>> +
>> +size_t
>> +__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>> +			size_t max)
>> +{
>> +
>> +  size_t i = max;
>> +
>> +  IFUNC_IMPL (i, name, strlen,
>> +	      IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LASX, __strlen_lasx)
>> +	      IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LSX, __strlen_lsx)
>> +	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned)
>> +	      )
>> +  return i;
>> +}
>> diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h
>> new file mode 100644
>> index 0000000000..e2b3490f39
>> --- /dev/null
>> +++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h
>> @@ -0,0 +1,36 @@
>> +/* Common definition for strlen implementation.
>> +   All versions must be listed in ifunc-impl-list.c.
>> +   Copyright (C) 2017-2023 Free Software Foundation, Inc.
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <https://www.gnu.org/licenses/>.  */
>> +
>> +#include <ldsodefs.h>
>> +#include <ifunc-init.h>
>> +
>> +extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
>> +extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
>> +extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
>> +
>> +static inline void *
>> +IFUNC_SELECTOR (void)
>> +{
>> +  if (SUPPORT_LASX)
>> +    return OPTIMIZE (lasx);
>> +  else if (SUPPORT_LSX)
>> +    return OPTIMIZE (lsx);
>> +  else
>> +    return OPTIMIZE (aligned);
>> +}
>> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
>> new file mode 100644
>> index 0000000000..b379e978a7
>> --- /dev/null
>> +++ b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
>> @@ -0,0 +1,101 @@
>> +/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
>> +
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library.  If not, see
>> +   <https://www.gnu.org/licenses/>.  */
>> +
>> +#include <sysdep.h>
>> +#include <sys/regdef.h>
>> +#include <sys/asm.h>
>> +
>> +#if IS_IN (libc)
>> +# define STRLEN __strlen_aligned
>> +#else
>> +# define STRLEN strlen
>> +#endif
> Is this really an improvement over the generic implementation? It seems to
> use a quite similar strategy.
>
>> +
>> +LEAF(STRLEN, 6)
>> +    move        a1, a0
>> +    bstrins.d   a0, zero, 2, 0
>> +    lu12i.w     a2, 0x01010
>> +    li.w        t0, -1
>> +
>> +    ld.d        t2, a0, 0
>> +    andi        t1, a1, 0x7
>> +    ori         a2, a2, 0x101
>> +    slli.d      t1, t1, 3
>> +
>> +    bstrins.d   a2, a2, 63, 32
>> +    sll.d       t1, t0, t1
>> +    slli.d      t3, a2, 7
>> +    nor         a3, zero, t3
>> +
>> +    orn         t2, t2, t1
>> +    sub.d       t0, t2, a2
>> +    nor         t1, t2, a3
>> +    and         t0, t0, t1
>> +
>> +
>> +    bnez        t0, L(count_pos)
>> +    addi.d      a0, a0, 8
>> +L(loop_16_7bit):
>> +    ld.d        t2, a0, 0
>> +    sub.d       t1, t2, a2
>> +
>> +    and         t0, t1, t3
>> +    bnez        t0, L(more_check)
>> +    ld.d        t2, a0, 8
>> +    sub.d       t1, t2, a2
>> +
>> +    and         t0, t1, t3
>> +    addi.d      a0, a0, 16
>> +    beqz        t0, L(loop_16_7bit)
>> +    addi.d      a0, a0, -8
>> +
>> +L(more_check):
>> +    nor         t0, t2, a3
>> +    and         t0, t1, t0
>> +    bnez        t0, L(count_pos)
>> +    addi.d      a0, a0, 8
>> +
>> +
>> +L(loop_16_8bit):
>> +    ld.d        t2, a0, 0
>> +    sub.d       t1, t2, a2
>> +    nor         t0, t2, a3
>> +    and         t0, t0, t1
>> +
>> +    bnez        t0, L(count_pos)
>> +    ld.d        t2, a0, 8
>> +    addi.d      a0, a0, 16
>> +    sub.d       t1, t2, a2
>> +
>> +    nor         t0, t2, a3
>> +    and         t0, t0, t1
>> +    beqz        t0, L(loop_16_8bit)
>> +    addi.d      a0, a0, -8
>> +
>> +L(count_pos):
>> +    ctz.d       t1, t0
>> +    sub.d       a0, a0, a1
>> +    srli.d      t1, t1, 3
>> +    add.d       a0, a0, t1
>> +
>> +    jr          ra
>> +END(STRLEN)
>> +
>> +#ifdef _LIBC
>> +libc_hidden_builtin_def (STRLEN)
>> +#endif
>> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
>> new file mode 100644
>> index 0000000000..56ac6403d3
>> --- /dev/null
>> +++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
>> @@ -0,0 +1,65 @@
> Missing one line comment.
>
>> +/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
>> +
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library.  If not, see
>> +   <https://www.gnu.org/licenses/>.  */
>> +
>> +#include <sysdep.h>
>> +#include <sys/regdef.h>
>> +#include <sys/asm.h>
>> +
>> +#if IS_IN (libc)
>> +
>> +# define STRLEN __strlen_lasx
>> +
>> +LEAF(STRLEN, 6)
>> +    move            a1, a0
>> +    bstrins.d       a0, zero, 4, 0
>> +    li.d            t1, -1
>> +    xvld            xr0, a0, 0
>> +
>> +    xvmsknz.b       xr0, xr0
>> +    xvpickve.w      xr1, xr0, 4
>> +    vilvl.h         vr0, vr1, vr0
>> +    movfr2gr.s      t0, fa0  # sign extend
>> +
>> +    sra.w           t0, t0, a1
>> +    beq             t0, t1, L(loop)
>> +    cto.w           a0, t0
>> +    jr              ra
>> +
>> +L(loop):
>> +    xvld            xr0, a0, 32
>> +    addi.d          a0, a0, 32
>> +    xvsetanyeqz.b   fcc0, xr0
>> +    bceqz           fcc0, L(loop)
>> +
>> +
>> +    xvmsknz.b       xr0, xr0
>> +    sub.d           a0, a0, a1
>> +    xvpickve.w      xr1, xr0, 4
>> +    vilvl.h         vr0, vr1, vr0
>> +
>> +    movfr2gr.s      t0, fa0
>> +    cto.w           t0, t0
>> +    add.d           a0, a0, t0
>> +    jr              ra
>> +END(STRLEN)
>> +
>> +#ifdef _LIBC
>> +libc_hidden_builtin_def (STRLEN)
>> +#endif
>> +
>> +#endif
> This implementation fails to assembler with binutils 2.40.0.20230525:
>
> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:31: Error: no match insn: xvld        $xr0,$r4,0
> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:33: Error: no match insn: xvmsknz.b   $xr0,$xr0
> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:34: Error: no match insn: xvpickve.w  $xr1,$xr0,4
> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:35: Error: no match insn: vilvl.h     $vr0,$vr1,$vr0
> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:44: Error: no match insn: xvld        $xr0,$r4,32
> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:46: Error: no match insn: xvsetanyeqz.b       $fcc0,$xr0
> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:50: Error: no match insn: xvmsknz.b   $xr0,$xr0
> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:52: Error: no match insn: xvpickve.w  $xr1,$xr0,4
> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:53: Error: no match insn: vilvl.h     $vr0,$vr1,$vr0
>
> You need to either add a configure option to increase the minimum required
> binutils or add a macro to synthetize the instruction on older binutils
> (similar to what sysdeps/powerpc/powerpc64/le/power9/strncmp.S does).
>
>
>> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
>> new file mode 100644
>> index 0000000000..1c19c98b5b
>> --- /dev/null
>> +++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
>> @@ -0,0 +1,73 @@
>> +/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
>> +
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library.  If not, see
>> +   <https://www.gnu.org/licenses/>.  */
>> +
>> +#include <sysdep.h>
>> +#include <sys/regdef.h>
>> +#include <sys/asm.h>
>> +
>> +#if IS_IN (libc)
>> +
>> +# define STRLEN __strlen_lsx
>> +
>> +LEAF(STRLEN, 6)
>> +    move            a1, a0
>> +    bstrins.d       a0, zero, 4, 0
>> +    vld             vr0, a0, 0
>> +    vld             vr1, a0, 16
>> +
>> +    li.d            t1, -1
>> +    vmsknz.b        vr0, vr0
>> +    vmsknz.b        vr1, vr1
>> +    vilvl.h         vr0, vr1, vr0
>> +
>> +    movfr2gr.s      t0, fa0
>> +    sra.w           t0, t0, a1
>> +    beq             t0, t1, L(loop)
>> +    cto.w           a0, t0
>> +
>> +    jr              ra
>> +    nop
>> +    nop
>> +    nop
>> +
>> +
>> +L(loop):
>> +    vld             vr0, a0, 32
>> +    vld             vr1, a0, 48
>> +    addi.d          a0, a0, 32
>> +    vmin.bu         vr2, vr0, vr1
>> +
>> +    vsetanyeqz.b    fcc0, vr2
>> +    bceqz           fcc0, L(loop)
>> +    vmsknz.b        vr0, vr0
>> +    vmsknz.b        vr1, vr1
>> +
>> +    vilvl.h         vr0, vr1, vr0
>> +    sub.d           a0, a0, a1
>> +    movfr2gr.s      t0, fa0
>> +    cto.w           t0, t0
>> +
>> +    add.d           a0, a0, t0
>> +    jr              ra
>> +END(STRLEN)
>> +
>> +#ifdef _LIBC
>> +libc_hidden_builtin_def (STRLEN)
>> +#endif
>> +
>> +#endif
> This implementation fails to assembler with binutils 2.40.0.20230525:
>
> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S: Assembler messages:
> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:30: Error: no match insn: vld  $vr0,$r4,0
> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:31: Error: no match insn: vld  $vr1,$r4,16
> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:34: Error: no match insn: vmsknz.b     $vr0,$vr0
> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:35: Error: no match insn: vmsknz.b     $vr1,$vr1
> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:36: Error: no match insn: vilvl.h      $vr0,$vr1,$vr0
> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:50: Error: no match insn: vld  $vr0,$r4,32
> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:51: Error: no match insn: vld  $vr1,$r4,48
> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:53: Error: no match insn: vmin.bu      $vr2,$vr0,$vr1
> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:55: Error: no match insn: vsetanyeqz.b $fcc0,$vr2
> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:57: Error: no match insn: vmsknz.b     $vr0,$vr0
> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:58: Error: no match insn: vmsknz.b     $vr1,$vr1
> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:60: Error: no match insn: vilvl.h      $vr0,$vr1,$vr0
>
>> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen.c b/sysdeps/loongarch/lp64/multiarch/strlen.c
>> new file mode 100644
>> index 0000000000..416ed0d9e2
>> --- /dev/null
>> +++ b/sysdeps/loongarch/lp64/multiarch/strlen.c
>> @@ -0,0 +1,37 @@
>> +/* Multiple versions of strlen.
>> +   All versions must be listed in ifunc-impl-list.c.
>> +   Copyright (C) 2017-2023 Free Software Foundation, Inc.
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <https://www.gnu.org/licenses/>.  */
>> +
>> +/* Define multiple versions only for the definition in libc.  */
>> +
>> +#if IS_IN (libc)
>> +# define strlen __redirect_strlen
>> +# include <string.h>
>> +# undef strlen
>> +
>> +# define SYMBOL_NAME strlen
>> +# include "ifunc-strlen.h"
>> +
>> +libc_ifunc_redirected (__redirect_strlen, strlen, IFUNC_SELECTOR ());
>> +
>> +# ifdef SHARED
>> +__hidden_ver1 (strlen, __GI_strlen, __redirect_strlen)
>> +  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlen);
>> +# endif
>> +
>> +#endif
>> diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h
>> index 5100f36d24..524d2e3277 100644
>> --- a/sysdeps/loongarch/sys/regdef.h
>> +++ b/sysdeps/loongarch/sys/regdef.h
>> @@ -89,6 +89,14 @@
>>   #define fs5 $f29
>>   #define fs6 $f30
>>   #define fs7 $f31
>> +#define fcc0 $fcc0
>> +#define fcc1 $fcc1
>> +#define fcc2 $fcc2
>> +#define fcc3 $fcc3
>> +#define fcc4 $fcc4
>> +#define fcc5 $fcc5
>> +#define fcc6 $fcc6
>> +#define fcc7 $fcc7
>>   
>>   #define vr0 $vr0
>>   #define vr1 $vr1
>> @@ -98,6 +106,30 @@
>>   #define vr5 $vr5
>>   #define vr6 $vr6
>>   #define vr7 $vr7
>> +#define vr8 $vr8
>> +#define vr9 $vr9
>> +#define vr10 $vr10
>> +#define vr11 $vr11
>> +#define vr12 $vr12
>> +#define vr13 $vr13
>> +#define vr14 $vr14
>> +#define vr15 $vr15
>> +#define vr16 $vr16
>> +#define vr17 $vr17
>> +#define vr18 $vr18
>> +#define vr19 $vr19
>> +#define vr20 $vr20
>> +#define vr21 $vr21
>> +#define vr22 $vr22
>> +#define vr23 $vr23
>> +#define vr24 $vr24
>> +#define vr25 $vr25
>> +#define vr26 $vr26
>> +#define vr27 $vr27
>> +#define vr28 $vr28
>> +#define vr29 $vr29
>> +#define vr30 $vr30
>> +#define vr31 $vr31
>>   
>>   #define xr0 $xr0
>>   #define xr1 $xr1
>> @@ -107,5 +139,30 @@
>>   #define xr5 $xr5
>>   #define xr6 $xr6
>>   #define xr7 $xr7
>> +#define xr7 $xr7
>> +#define xr8 $xr8
>> +#define xr9 $xr9
>> +#define xr10 $xr10
>> +#define xr11 $xr11
>> +#define xr12 $xr12
>> +#define xr13 $xr13
>> +#define xr14 $xr14
>> +#define xr15 $xr15
>> +#define xr16 $xr16
>> +#define xr17 $xr17
>> +#define xr18 $xr18
>> +#define xr19 $xr19
>> +#define xr20 $xr20
>> +#define xr21 $xr21
>> +#define xr22 $xr22
>> +#define xr23 $xr23
>> +#define xr24 $xr24
>> +#define xr25 $xr25
>> +#define xr26 $xr26
>> +#define xr27 $xr27
>> +#define xr28 $xr28
>> +#define xr29 $xr29
>> +#define xr30 $xr30
>> +#define xr31 $xr31
>>   
>>   #endif /* _SYS_REGDEF_H */
>> diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
>> index e371e13b15..d1a280a5ee 100644
>> --- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
>> +++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
>> @@ -25,5 +25,7 @@
>>   #define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX)
>>   #define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX)
>>   
>> +#define INIT_ARCH()
>> +
>>   #endif /* _CPU_FEATURES_LOONGARCH64_H  */
>>
  
dengjianbo Aug. 2, 2023, 12:25 p.m. UTC | #4
>>>
>>> On 2023-08-02 10:31, Adhemerval Zanella Netto wrote:
>>> +#if IS_IN (libc)
>>> +# define STRLEN __strlen_aligned
>>> +#else
>>> +# define STRLEN strlen
>>> +#endif
>> Is this really an improvement over the generic implementation? It 
>> seems to
>> use a quite similar strategy.
Compared with the code generated by the compiler, the assembly code unrolls
the loop to handle 16 bytes per iteration, and it handles ASCII data and
non-ASCII data separately, which can take fewer instructions to calculate
the length of ASCII data. Besides, the assembly code uses fewer instructions
to start the loop. I think the performance improvement comes from this.
Please also check the bench results at:
https://github.com/jiadengx/glibc_test/blob/main/strlen/bench-strlen.out
>> This implementation fails to assembler with binutils 2.40.0.20230525:
>>
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:31: Error: no match 
>> insn: xvld        $xr0,$r4,0
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:33: Error: no match 
>> insn: xvmsknz.b   $xr0,$xr0
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:34: Error: no match 
>> insn: xvpickve.w  $xr1,$xr0,4
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:35: Error: no match 
>> insn: vilvl.h     $vr0,$vr1,$vr0
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:44: Error: no match 
>> insn: xvld        $xr0,$r4,32
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:46: Error: no match 
>> insn: xvsetanyeqz.b       $fcc0,$xr0
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:50: Error: no match 
>> insn: xvmsknz.b   $xr0,$xr0
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:52: Error: no match 
>> insn: xvpickve.w  $xr1,$xr0,4
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:53: Error: no match 
>> insn: vilvl.h     $vr0,$vr1,$vr0
>>
>> You need to either add a configure option to increase the minimum 
>> required
>> binutils or add a macro to synthetize the instruction on older binutils
>> (similar to what sysdeps/powerpc/powerpc64/le/power9/strncmp.S does).
The configuration variable loongarch_vec_asm has been added in patch v2.
At configure time, it checks whether the assembler supports LSX/LASX and
decides whether the strlen LSX/LASX code gets compiled.

diff --git a/sysdeps/loongarch/configure.ac b/sysdeps/loongarch/configure.ac
index 39efccfd8f..9fadf7bb9d 100644
--- a/sysdeps/loongarch/configure.ac
+++ b/sysdeps/loongarch/configure.ac
@@ -74,6 +74,8 @@ else
    libc_cv_loongarch_vec_asm=no
  fi
  rm -f conftest*])
+LIBC_CONFIG_VAR([loongarch_vec_asm], [$libc_cv_loongarch_vec_asm])
+
  if test $libc_cv_loongarch_vec_asm = yes; then
    AC_DEFINE(HAVE_LOONGARCH_VEC_ASM)
  fi
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
new file mode 100644
index 0000000000..73b7f61969
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -0,0 +1,11 @@
+ifeq ($(subdir),string)
+sysdep_routines += strlen-aligned \
+	# sysdep_routines
+
+ifeq ($(loongarch_vec_asm), yes)
+sysdep_routines += strlen-lsx \
+	strlen-lasx \
+	# sysdep_routines
+endif
+
+endif

Detailed info can be found at:
https://sourceware.org/pipermail/libc-alpha/2023-August/150566.html
>> This implementation fails to assembler with binutils 2.40.0.20230525:
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S: Assembler messages:
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:30: Error: no match 
>> insn: vld  $vr0,$r4,0
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:31: Error: no match 
>> insn: vld  $vr1,$r4,16
>>
Sorry, that was my mistake regarding the binutils version. Could you
please try the latest release, version 2.41?

The following issues are also fixed in patch v2:
1. Missing one line comment.
2. I think it should be only 2023 here and for other new file as well. 
(Copyright)


On 2023-08-02 09:25, caiyinyu wrote:
>
> 在 2023/8/1 下午10:31, Adhemerval Zanella Netto 写道:
>>
>> On 01/08/23 04:09, dengjianbo wrote:
>>> 1. strlen-lasx is implemeted by LASX simd instructions(256bit)
>>> 2. strlen-lsx is implemeted by LSX simd instructions(128bit)
>>> 3. strlen-align is implemented by LA basic instructions and never 
>>> use unaligned memory acess
>> Usually optimization routines are added along benchmarks number to show
>> the expected improvements over different sizes and alignment.
>
> The performance test plots for these functions over different sizes 
> and alignment are here:
> https://github.com/jiadengx/glibc_test/blob/main/strlen/strlen_align.png
> https://github.com/jiadengx/glibc_test/blob/main/strlen/strlen_lasx.png
> https://github.com/jiadengx/glibc_test/blob/main/strlen/strlen_lsx.png
>
>
>>
>>> ---
>>>   sysdeps/loongarch/lp64/multiarch/Makefile     |   3 +
>>>   .../lp64/multiarch/ifunc-impl-list.c          |  39 +++++++
>>>   .../loongarch/lp64/multiarch/ifunc-strlen.h   |  36 +++++++
>>>   .../loongarch/lp64/multiarch/strlen-aligned.S | 101 
>>> ++++++++++++++++++
>>>   .../loongarch/lp64/multiarch/strlen-lasx.S    |  65 +++++++++++
>>>   sysdeps/loongarch/lp64/multiarch/strlen-lsx.S |  73 +++++++++++++
>>>   sysdeps/loongarch/lp64/multiarch/strlen.c     |  37 +++++++
>>>   sysdeps/loongarch/sys/regdef.h                |  57 ++++++++++
>>>   .../unix/sysv/linux/loongarch/cpu-features.h  |   2 +
>>>   9 files changed, 413 insertions(+)
>>>   create mode 100644 sysdeps/loongarch/lp64/multiarch/Makefile
>>>   create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
>>>   create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h
>>>   create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
>>>   create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
>>>   create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
>>>   create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen.c
>>>
>>> diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile 
>>> b/sysdeps/loongarch/lp64/multiarch/Makefile
>>> new file mode 100644
>>> index 0000000000..529a8b6bab
>>> --- /dev/null
>>> +++ b/sysdeps/loongarch/lp64/multiarch/Makefile
>>> @@ -0,0 +1,3 @@
>>> +ifeq ($(subdir),string)
>>> +sysdep_routines += strlen-aligned strlen-lsx strlen-lasx
>>> +endif
>> One entry per line:
>>
>> sysdep_routines += \
>>    strlen-aligned \
>>    strlen-lsx \
>>    strlen-lasx \
>>    # sysdep_routines
>>
>>> diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c 
>>> b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
>>> new file mode 100644
>>> index 0000000000..b35e41127e
>>> --- /dev/null
>>> +++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
>>> @@ -0,0 +1,39 @@
>>> +/* Enumerate available IFUNC implementations of a function. 
>>> LoongArch64 version.
>>> +   Copyright (C) 2017-2023 Free Software Foundation, Inc.
>> I think it should be only 2023 here and for other new file as well.
>>
>>> +   This file is part of the GNU C Library.
>>> +
>>> +   The GNU C Library is free software; you can redistribute it and/or
>>> +   modify it under the terms of the GNU Lesser General Public
>>> +   License as published by the Free Software Foundation; either
>>> +   version 2.1 of the License, or (at your option) any later version.
>>> +
>>> +   The GNU C Library is distributed in the hope that it will be 
>>> useful,
>>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> +   Lesser General Public License for more details.
>>> +
>>> +   You should have received a copy of the GNU Lesser General Public
>>> +   License along with the GNU C Library; if not, see
>>> +   <http://www.gnu.org/licenses/>.  */
>>> +
>>> +#include <assert.h>
>>> +#include <string.h>
>>> +#include <wchar.h>
>>> +#include <ldsodefs.h>
>>> +#include <ifunc-impl-list.h>
>>> +#include <stdio.h>
>>> +
>>> +size_t
>>> +__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl 
>>> *array,
>>> +            size_t max)
>>> +{
>>> +
>>> +  size_t i = max;
>>> +
>>> +  IFUNC_IMPL (i, name, strlen,
>>> +          IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LASX, 
>>> __strlen_lasx)
>>> +          IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LSX, __strlen_lsx)
>>> +          IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned)
>>> +          )
>>> +  return i;
>>> +}
>>> diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h 
>>> b/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h
>>> new file mode 100644
>>> index 0000000000..e2b3490f39
>>> --- /dev/null
>>> +++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h
>>> @@ -0,0 +1,36 @@
>>> +/* Common definition for strlen implementation.
>>> +   All versions must be listed in ifunc-impl-list.c.
>>> +   Copyright (C) 2017-2023 Free Software Foundation, Inc.
>>> +   This file is part of the GNU C Library.
>>> +
>>> +   The GNU C Library is free software; you can redistribute it and/or
>>> +   modify it under the terms of the GNU Lesser General Public
>>> +   License as published by the Free Software Foundation; either
>>> +   version 2.1 of the License, or (at your option) any later version.
>>> +
>>> +   The GNU C Library is distributed in the hope that it will be 
>>> useful,
>>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> +   Lesser General Public License for more details.
>>> +
>>> +   You should have received a copy of the GNU Lesser General Public
>>> +   License along with the GNU C Library; if not, see
>>> +   <https://www.gnu.org/licenses/>.  */
>>> +
>>> +#include <ldsodefs.h>
>>> +#include <ifunc-init.h>
>>> +
>>> +extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
>>> +extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
>>> +extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
>>> +
>>> +static inline void *
>>> +IFUNC_SELECTOR (void)
>>> +{
>>> +  if (SUPPORT_LASX)
>>> +    return OPTIMIZE (lasx);
>>> +  else if (SUPPORT_LSX)
>>> +    return OPTIMIZE (lsx);
>>> +  else
>>> +    return OPTIMIZE (aligned);
>>> +}
>>> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S 
>>> b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
>>> new file mode 100644
>>> index 0000000000..b379e978a7
>>> --- /dev/null
>>> +++ b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
>>> @@ -0,0 +1,101 @@
>>> +/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
>>> +
>>> +   This file is part of the GNU C Library.
>>> +
>>> +   The GNU C Library is free software; you can redistribute it and/or
>>> +   modify it under the terms of the GNU Lesser General Public
>>> +   License as published by the Free Software Foundation; either
>>> +   version 2.1 of the License, or (at your option) any later version.
>>> +
>>> +   The GNU C Library is distributed in the hope that it will be 
>>> useful,
>>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> +   Lesser General Public License for more details.
>>> +
>>> +   You should have received a copy of the GNU Lesser General Public
>>> +   License along with the GNU C Library.  If not, see
>>> +   <https://www.gnu.org/licenses/>.  */
>>> +
>>> +#include <sysdep.h>
>>> +#include <sys/regdef.h>
>>> +#include <sys/asm.h>
>>> +
>>> +#if IS_IN (libc)
>>> +# define STRLEN __strlen_aligned
>>> +#else
>>> +# define STRLEN strlen
>>> +#endif
>> Is this really an improvement over the generic implementation? It 
>> seems to
>> use a quite similar strategy.
>>
>>> +
>>> +LEAF(STRLEN, 6)
>>> +    move        a1, a0
>>> +    bstrins.d   a0, zero, 2, 0
>>> +    lu12i.w     a2, 0x01010
>>> +    li.w        t0, -1
>>> +
>>> +    ld.d        t2, a0, 0
>>> +    andi        t1, a1, 0x7
>>> +    ori         a2, a2, 0x101
>>> +    slli.d      t1, t1, 3
>>> +
>>> +    bstrins.d   a2, a2, 63, 32
>>> +    sll.d       t1, t0, t1
>>> +    slli.d      t3, a2, 7
>>> +    nor         a3, zero, t3
>>> +
>>> +    orn         t2, t2, t1
>>> +    sub.d       t0, t2, a2
>>> +    nor         t1, t2, a3
>>> +    and         t0, t0, t1
>>> +
>>> +
>>> +    bnez        t0, L(count_pos)
>>> +    addi.d      a0, a0, 8
>>> +L(loop_16_7bit):
>>> +    ld.d        t2, a0, 0
>>> +    sub.d       t1, t2, a2
>>> +
>>> +    and         t0, t1, t3
>>> +    bnez        t0, L(more_check)
>>> +    ld.d        t2, a0, 8
>>> +    sub.d       t1, t2, a2
>>> +
>>> +    and         t0, t1, t3
>>> +    addi.d      a0, a0, 16
>>> +    beqz        t0, L(loop_16_7bit)
>>> +    addi.d      a0, a0, -8
>>> +
>>> +L(more_check):
>>> +    nor         t0, t2, a3
>>> +    and         t0, t1, t0
>>> +    bnez        t0, L(count_pos)
>>> +    addi.d      a0, a0, 8
>>> +
>>> +
>>> +L(loop_16_8bit):
>>> +    ld.d        t2, a0, 0
>>> +    sub.d       t1, t2, a2
>>> +    nor         t0, t2, a3
>>> +    and         t0, t0, t1
>>> +
>>> +    bnez        t0, L(count_pos)
>>> +    ld.d        t2, a0, 8
>>> +    addi.d      a0, a0, 16
>>> +    sub.d       t1, t2, a2
>>> +
>>> +    nor         t0, t2, a3
>>> +    and         t0, t0, t1
>>> +    beqz        t0, L(loop_16_8bit)
>>> +    addi.d      a0, a0, -8
>>> +
>>> +L(count_pos):
>>> +    ctz.d       t1, t0
>>> +    sub.d       a0, a0, a1
>>> +    srli.d      t1, t1, 3
>>> +    add.d       a0, a0, t1
>>> +
>>> +    jr          ra
>>> +END(STRLEN)
>>> +
>>> +#ifdef _LIBC
>>> +libc_hidden_builtin_def (STRLEN)
>>> +#endif
>>> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S 
>>> b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
>>> new file mode 100644
>>> index 0000000000..56ac6403d3
>>> --- /dev/null
>>> +++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
>>> @@ -0,0 +1,65 @@
>> Missing one line comment.
>>
>>> +/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
>>> +
>>> +   This file is part of the GNU C Library.
>>> +
>>> +   The GNU C Library is free software; you can redistribute it and/or
>>> +   modify it under the terms of the GNU Lesser General Public
>>> +   License as published by the Free Software Foundation; either
>>> +   version 2.1 of the License, or (at your option) any later version.
>>> +
>>> +   The GNU C Library is distributed in the hope that it will be 
>>> useful,
>>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> +   Lesser General Public License for more details.
>>> +
>>> +   You should have received a copy of the GNU Lesser General Public
>>> +   License along with the GNU C Library.  If not, see
>>> +   <https://www.gnu.org/licenses/>.  */
>>> +
>>> +#include <sysdep.h>
>>> +#include <sys/regdef.h>
>>> +#include <sys/asm.h>
>>> +
>>> +#if IS_IN (libc)
>>> +
>>> +# define STRLEN __strlen_lasx
>>> +
>>> +LEAF(STRLEN, 6)
>>> +    move            a1, a0
>>> +    bstrins.d       a0, zero, 4, 0
>>> +    li.d            t1, -1
>>> +    xvld            xr0, a0, 0
>>> +
>>> +    xvmsknz.b       xr0, xr0
>>> +    xvpickve.w      xr1, xr0, 4
>>> +    vilvl.h         vr0, vr1, vr0
>>> +    movfr2gr.s      t0, fa0  # sign extend
>>> +
>>> +    sra.w           t0, t0, a1
>>> +    beq             t0, t1, L(loop)
>>> +    cto.w           a0, t0
>>> +    jr              ra
>>> +
>>> +L(loop):
>>> +    xvld            xr0, a0, 32
>>> +    addi.d          a0, a0, 32
>>> +    xvsetanyeqz.b   fcc0, xr0
>>> +    bceqz           fcc0, L(loop)
>>> +
>>> +
>>> +    xvmsknz.b       xr0, xr0
>>> +    sub.d           a0, a0, a1
>>> +    xvpickve.w      xr1, xr0, 4
>>> +    vilvl.h         vr0, vr1, vr0
>>> +
>>> +    movfr2gr.s      t0, fa0
>>> +    cto.w           t0, t0
>>> +    add.d           a0, a0, t0
>>> +    jr              ra
>>> +END(STRLEN)
>>> +
>>> +#ifdef _LIBC
>>> +libc_hidden_builtin_def (STRLEN)
>>> +#endif
>>> +
>>> +#endif
>> This implementation fails to assembler with binutils 2.40.0.20230525:
>>
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:31: Error: no match 
>> insn: xvld        $xr0,$r4,0
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:33: Error: no match 
>> insn: xvmsknz.b   $xr0,$xr0
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:34: Error: no match 
>> insn: xvpickve.w  $xr1,$xr0,4
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:35: Error: no match 
>> insn: vilvl.h     $vr0,$vr1,$vr0
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:44: Error: no match 
>> insn: xvld        $xr0,$r4,32
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:46: Error: no match 
>> insn: xvsetanyeqz.b       $fcc0,$xr0
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:50: Error: no match 
>> insn: xvmsknz.b   $xr0,$xr0
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:52: Error: no match 
>> insn: xvpickve.w  $xr1,$xr0,4
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:53: Error: no match 
>> insn: vilvl.h     $vr0,$vr1,$vr0
>>
>> You need to either add a configure option to increase the minimum 
>> required
>> binutils or add a macro to synthetize the instruction on older binutils
>> (similar to what sysdeps/powerpc/powerpc64/le/power9/strncmp.S does).
>>
>>
>>> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S 
>>> b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
>>> new file mode 100644
>>> index 0000000000..1c19c98b5b
>>> --- /dev/null
>>> +++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
>>> @@ -0,0 +1,73 @@
>>> +/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
>>> +
>>> +   This file is part of the GNU C Library.
>>> +
>>> +   The GNU C Library is free software; you can redistribute it and/or
>>> +   modify it under the terms of the GNU Lesser General Public
>>> +   License as published by the Free Software Foundation; either
>>> +   version 2.1 of the License, or (at your option) any later version.
>>> +
>>> +   The GNU C Library is distributed in the hope that it will be 
>>> useful,
>>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> +   Lesser General Public License for more details.
>>> +
>>> +   You should have received a copy of the GNU Lesser General Public
>>> +   License along with the GNU C Library.  If not, see
>>> +   <https://www.gnu.org/licenses/>.  */
>>> +
>>> +#include <sysdep.h>
>>> +#include <sys/regdef.h>
>>> +#include <sys/asm.h>
>>> +
>>> +#if IS_IN (libc)
>>> +
>>> +# define STRLEN __strlen_lsx
>>> +
>>> +LEAF(STRLEN, 6)
>>> +    move            a1, a0
>>> +    bstrins.d       a0, zero, 4, 0
>>> +    vld             vr0, a0, 0
>>> +    vld             vr1, a0, 16
>>> +
>>> +    li.d            t1, -1
>>> +    vmsknz.b        vr0, vr0
>>> +    vmsknz.b        vr1, vr1
>>> +    vilvl.h         vr0, vr1, vr0
>>> +
>>> +    movfr2gr.s      t0, fa0
>>> +    sra.w           t0, t0, a1
>>> +    beq             t0, t1, L(loop)
>>> +    cto.w           a0, t0
>>> +
>>> +    jr              ra
>>> +    nop
>>> +    nop
>>> +    nop
>>> +
>>> +
>>> +L(loop):
>>> +    vld             vr0, a0, 32
>>> +    vld             vr1, a0, 48
>>> +    addi.d          a0, a0, 32
>>> +    vmin.bu         vr2, vr0, vr1
>>> +
>>> +    vsetanyeqz.b    fcc0, vr2
>>> +    bceqz           fcc0, L(loop)
>>> +    vmsknz.b        vr0, vr0
>>> +    vmsknz.b        vr1, vr1
>>> +
>>> +    vilvl.h         vr0, vr1, vr0
>>> +    sub.d           a0, a0, a1
>>> +    movfr2gr.s      t0, fa0
>>> +    cto.w           t0, t0
>>> +
>>> +    add.d           a0, a0, t0
>>> +    jr              ra
>>> +END(STRLEN)
>>> +
>>> +#ifdef _LIBC
>>> +libc_hidden_builtin_def (STRLEN)
>>> +#endif
>>> +
>>> +#endif
>> This implementation fails to assembler with binutils 2.40.0.20230525:
>>
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S: Assembler messages:
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:30: Error: no match 
>> insn: vld  $vr0,$r4,0
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:31: Error: no match 
>> insn: vld  $vr1,$r4,16
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:34: Error: no match 
>> insn: vmsknz.b     $vr0,$vr0
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:35: Error: no match 
>> insn: vmsknz.b     $vr1,$vr1
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:36: Error: no match 
>> insn: vilvl.h      $vr0,$vr1,$vr0
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:50: Error: no match 
>> insn: vld  $vr0,$r4,32
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:51: Error: no match 
>> insn: vld  $vr1,$r4,48
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:53: Error: no match 
>> insn: vmin.bu      $vr2,$vr0,$vr1
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:55: Error: no match 
>> insn: vsetanyeqz.b $fcc0,$vr2
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:57: Error: no match 
>> insn: vmsknz.b     $vr0,$vr0
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:58: Error: no match 
>> insn: vmsknz.b     $vr1,$vr1
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:60: Error: no match 
>> insn: vilvl.h      $vr0,$vr1,$vr0
>>
>>> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen.c 
>>> b/sysdeps/loongarch/lp64/multiarch/strlen.c
>>> new file mode 100644
>>> index 0000000000..416ed0d9e2
>>> --- /dev/null
>>> +++ b/sysdeps/loongarch/lp64/multiarch/strlen.c
>>> @@ -0,0 +1,37 @@
>>> +/* Multiple versions of strlen.
>>> +   All versions must be listed in ifunc-impl-list.c.
>>> +   Copyright (C) 2017-2023 Free Software Foundation, Inc.
>>> +   This file is part of the GNU C Library.
>>> +
>>> +   The GNU C Library is free software; you can redistribute it and/or
>>> +   modify it under the terms of the GNU Lesser General Public
>>> +   License as published by the Free Software Foundation; either
>>> +   version 2.1 of the License, or (at your option) any later version.
>>> +
>>> +   The GNU C Library is distributed in the hope that it will be 
>>> useful,
>>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> +   Lesser General Public License for more details.
>>> +
>>> +   You should have received a copy of the GNU Lesser General Public
>>> +   License along with the GNU C Library; if not, see
>>> +   <https://www.gnu.org/licenses/>.  */
>>> +
>>> +/* Define multiple versions only for the definition in libc. */
>>> +
>>> +#if IS_IN (libc)
>>> +# define strlen __redirect_strlen
>>> +# include <string.h>
>>> +# undef strlen
>>> +
>>> +# define SYMBOL_NAME strlen
>>> +# include "ifunc-strlen.h"
>>> +
>>> +libc_ifunc_redirected (__redirect_strlen, strlen, IFUNC_SELECTOR ());
>>> +
>>> +# ifdef SHARED
>>> +__hidden_ver1 (strlen, __GI_strlen, __redirect_strlen)
>>> +  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlen);
>>> +# endif
>>> +
>>> +#endif
>>> diff --git a/sysdeps/loongarch/sys/regdef.h 
>>> b/sysdeps/loongarch/sys/regdef.h
>>> index 5100f36d24..524d2e3277 100644
>>> --- a/sysdeps/loongarch/sys/regdef.h
>>> +++ b/sysdeps/loongarch/sys/regdef.h
>>> @@ -89,6 +89,14 @@
>>>   #define fs5 $f29
>>>   #define fs6 $f30
>>>   #define fs7 $f31
>>> +#define fcc0 $fcc0
>>> +#define fcc1 $fcc1
>>> +#define fcc2 $fcc2
>>> +#define fcc3 $fcc3
>>> +#define fcc4 $fcc4
>>> +#define fcc5 $fcc5
>>> +#define fcc6 $fcc6
>>> +#define fcc7 $fcc7
>>>     #define vr0 $vr0
>>>   #define vr1 $vr1
>>> @@ -98,6 +106,30 @@
>>>   #define vr5 $vr5
>>>   #define vr6 $vr6
>>>   #define vr7 $vr7
>>> +#define vr8 $vr8
>>> +#define vr9 $vr9
>>> +#define vr10 $vr10
>>> +#define vr11 $vr11
>>> +#define vr12 $vr12
>>> +#define vr13 $vr13
>>> +#define vr14 $vr14
>>> +#define vr15 $vr15
>>> +#define vr16 $vr16
>>> +#define vr17 $vr17
>>> +#define vr18 $vr18
>>> +#define vr19 $vr19
>>> +#define vr20 $vr20
>>> +#define vr21 $vr21
>>> +#define vr22 $vr22
>>> +#define vr23 $vr23
>>> +#define vr24 $vr24
>>> +#define vr25 $vr25
>>> +#define vr26 $vr26
>>> +#define vr27 $vr27
>>> +#define vr28 $vr28
>>> +#define vr29 $vr29
>>> +#define vr30 $vr30
>>> +#define vr31 $vr31
>>>     #define xr0 $xr0
>>>   #define xr1 $xr1
>>> @@ -107,5 +139,30 @@
>>>   #define xr5 $xr5
>>>   #define xr6 $xr6
>>>   #define xr7 $xr7
>>> +#define xr7 $xr7
>>> +#define xr8 $xr8
>>> +#define xr9 $xr9
>>> +#define xr10 $xr10
>>> +#define xr11 $xr11
>>> +#define xr12 $xr12
>>> +#define xr13 $xr13
>>> +#define xr14 $xr14
>>> +#define xr15 $xr15
>>> +#define xr16 $xr16
>>> +#define xr17 $xr17
>>> +#define xr18 $xr18
>>> +#define xr19 $xr19
>>> +#define xr20 $xr20
>>> +#define xr21 $xr21
>>> +#define xr22 $xr22
>>> +#define xr23 $xr23
>>> +#define xr24 $xr24
>>> +#define xr25 $xr25
>>> +#define xr26 $xr26
>>> +#define xr27 $xr27
>>> +#define xr28 $xr28
>>> +#define xr29 $xr29
>>> +#define xr30 $xr30
>>> +#define xr31 $xr31
>>>     #endif /* _SYS_REGDEF_H */
>>> diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h 
>>> b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
>>> index e371e13b15..d1a280a5ee 100644
>>> --- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
>>> +++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
>>> @@ -25,5 +25,7 @@
>>>   #define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX)
>>>   #define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX)
>>>   +#define INIT_ARCH()
>>> +
>>>   #endif /* _CPU_FEATURES_LOONGARCH64_H  */
  
dengjianbo Aug. 2, 2023, 12:47 p.m. UTC | #5
On 2023-08-01 22:44, Xi Ruoyao wrote:
> On Tue, 2023-08-01 at 15:09 +0800, dengjianbo wrote:
>
> /* snip */
>
>> diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile
>> b/sysdeps/loongarch/lp64/multiarch/Makefile
>> new file mode 100644
>> index 0000000000..529a8b6bab
>> --- /dev/null
>> +++ b/sysdeps/loongarch/lp64/multiarch/Makefile
>> @@ -0,0 +1,3 @@
>> +ifeq ($(subdir),string)
>> +sysdep_routines += strlen-aligned strlen-lsx strlen-lasx
>> +endif
> Please check if the assembler supports LSX/LASX, if not you should not
> add strlen-lsx and strlen-lasx here.  We don't want to disallow building
> Glibc for LoongArch with old assembler.
>
A new configuration variable loongarch_vec_asm has been added in patch
v2. When doing the configuration, it will check if the assembler
supports LSX/LASX and set the corresponding value. Then we can check it
in the Makefile to decide whether the strlen LASX/LSX code can be compiled.

+LIBC_CONFIG_VAR([loongarch_vec_asm], [$libc_cv_loongarch_vec_asm])
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -0,0 +1,11 @@
+ifeq ($(subdir),string)
+sysdep_routines += strlen-aligned \
+       # sysdep_routines
+
+ifeq ($(loongarch_vec_asm), yes)
+sysdep_routines += strlen-lsx \
+       strlen-lasx \
+       # sysdep_routines
+endif
+
+endif

For detailed info, please see:
https://sourceware.org/pipermail/libc-alpha/2023-August/150566.html
  
Adhemerval Zanella Netto Aug. 2, 2023, 12:59 p.m. UTC | #6
On 02/08/23 09:25, dengjianbo wrote:
>>>>
>>>> On 2023-08-02 10:31, Adhemerval Zanella Netto wrote:
>>>> +#if IS_IN (libc)
>>>> +# define STRLEN __strlen_aligned
>>>> +#else
>>>> +# define STRLEN strlen
>>>> +#endif
>>> Is this really an improvement over the generic implementation? It seems to
>>> use a quite similar strategy.
> Comparing with the code generated by compiler, the assembly code does an 16bytes loop
> unrolling, and handles ascii data and non-ascii data separately which could take less
> instructions to calculate the length of  ascii data. besides, the assembly code using
> fewer instructions to start the loop. I think the performance improvement benefits from
> this. Please kindly check bench result also from:
> https://github.com/jiadengx/glibc_test/blob/main/strlen/bench-strlen.out

From the summarized results [1], it seems that the initial start to mask
off unaligned inputs is slightly better.  The __strlen_aligned only seems
better for sizes larger than 32 (the 16 length results seem strange).
Maybe you could improve shift_find/find_zero_all/index_first on loongarch.

Does it improve by explicitly instructing the compiler to unroll the loop?

diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
index 43d2f583cd..d807a5e0d2 100644
--- a/sysdeps/loongarch/Makefile
+++ b/sysdeps/loongarch/Makefile
@@ -15,3 +15,7 @@ ASFLAGS-.os += $(pic-ccflag)
 ifeq (yes,$(have-cmodel-medium))
 CFLAGS-.oS += -mcmodel=medium
 endif
+
+ifeq ($(subdir),string)
+CFLAGS-strlen.c += -funroll-all-loops --param max-variable-expansions-in-unroller=2
+endif

[1] https://github.com/jiadengx/glibc_test/blob/main/strlen/strlen_align.png

>>> This implementation fails to assembler with binutils 2.40.0.20230525:
>>>
>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:31: Error: no match insn: xvld        $xr0,$r4,0
>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:33: Error: no match insn: xvmsknz.b   $xr0,$xr0
>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:34: Error: no match insn: xvpickve.w  $xr1,$xr0,4
>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:35: Error: no match insn: vilvl.h     $vr0,$vr1,$vr0
>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:44: Error: no match insn: xvld        $xr0,$r4,32
>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:46: Error: no match insn: xvsetanyeqz.b       $fcc0,$xr0
>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:50: Error: no match insn: xvmsknz.b   $xr0,$xr0
>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:52: Error: no match insn: xvpickve.w  $xr1,$xr0,4
>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:53: Error: no match insn: vilvl.h     $vr0,$vr1,$vr0
>>>
>>> You need to either add a configure option to increase the minimum required
>>> binutils or add a macro to synthetize the instruction on older binutils
>>> (similar to what sysdeps/powerpc/powerpc64/le/power9/strncmp.S does).
> Configuration variable loongarch_vec_asm has been added in patch v2, when doing the configuration,
> it will check if the assembler supports LSX/LASX and decides whether strlen LSX/LASX code get compiled.
> 



> diff --git a/sysdeps/loongarch/configure.ac b/sysdeps/loongarch/configure.ac
> index 39efccfd8f..9fadf7bb9d 100644
> --- a/sysdeps/loongarch/configure.ac
> +++ b/sysdeps/loongarch/configure.ac
> @@ -74,6 +74,8 @@ else
>    libc_cv_loongarch_vec_asm=no
>  fi
>  rm -f conftest*])
> +LIBC_CONFIG_VAR([loongarch_vec_asm], [$libc_cv_loongarch_vec_asm])
> +
>  if test $libc_cv_loongarch_vec_asm = yes; then
>    AC_DEFINE(HAVE_LOONGARCH_VEC_ASM)
>  fi
> diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
> new file mode 100644
> index 0000000000..73b7f61969
> --- /dev/null
> +++ b/sysdeps/loongarch/lp64/multiarch/Makefile
> @@ -0,0 +1,11 @@
> +ifeq ($(subdir),string)
> +sysdep_routines += strlen-aligned \
> +	# sysdep_routines
> +
> +ifeq ($(loongarch_vec_asm), yes)
> +sysdep_routines += strlen-lsx \
> +	strlen-lasx \
> +	# sysdep_routines
> +endif
> +
> +endif
> 
> Detailed info can be find from:
> https://sourceware.org/pipermail/libc-alpha/2023-August/150566.html
>>> This implementation fails to assembler with binutils 2.40.0.20230525:
>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S: Assembler messages:
>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:30: Error: no match insn: vld  $vr0,$r4,0
>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:31: Error: no match insn: vld  $vr1,$r4,16
>>>
> Sorry, it's my mistake for the wrong version of binutils. Could you please try the latest release
> version 2.41?

Although it should work, it is unexpected that, depending on the assembler used,
some optimized routines are not enabled.

> 
> Following issues is  also fixed in the patch v2:
> 1. Missing one line comment.
> 2. I think it should be only 2023 here and for other new file as well. (Copyright)
  
dengjianbo Aug. 3, 2023, 1:27 p.m. UTC | #7
On 2023-08-02 20:59, Adhemerval Zanella Netto wrote:
>>>>> On 2023-08-02 10:31, Adhemerval Zanella Netto wrote:
>>>>> +#if IS_IN (libc)
>>>>> +# define STRLEN __strlen_aligned
>>>>> +#else
>>>>> +# define STRLEN strlen
>>>>> +#endif
>>>> Is this really an improvement over the generic implementation? It seems to
>>>> use a quite similar strategy.
>> Comparing with the code generated by compiler, the assembly code does an 16bytes loop
>> unrolling, and handles ascii data and non-ascii data separately which could take less
>> instructions to calculate the length of  ascii data. besides, the assembly code using
>> fewer instructions to start the loop. I think the performance improvement benefits from
>> this. Please kindly check bench result also from:
>> https://github.com/jiadengx/glibc_test/blob/main/strlen/bench-strlen.out
> From the summarized results [1], it seems that the initial start to mask
> off unaligned inputs are slight better.  The __strlen_aligned onl seems
> better to sizes larger than 32 (the 16 lenght results seems strange).
> Maybe you coult improve shift_find/find_zero_all/index_first on loongarch.
>
> Does it improve by explicit instructing compiler to unroll the loop?
As you know, the assembly versions of strlen use the same strategy to
calculate string length. If the assembly code only calculated 8 bytes in
the loop and didn't separate ascii and non-ascii data, the code of the
loop and loop end part would be the same as the compiler-generated code
based on generic strlen. Loongarch doesn't provide instructions like alpha
cmpbge, so there are not many optimizations that could be done on
find_zero_all/index_first/has_zero, except that we can remove some
BIG_ENDIAN code.
 
Refer to the latest test results in the chart, comparing the assembly
implementation vs. the generic strlen implementation (compiled using
CFLAGS-strlen.c += -funroll-all-loops --param
max-variable-expansions-in-unroller=2): the performance
improvement of the assembly implementation is evident (30% ~ 40%),
especially in cases when the length is greater than 64 bytes.
Please see the results via:
https://github.com/jiadengx/glibc_test/blob/main/strlen2/bench1/generic_strlen_with_loop_unrolling.png
>>>> This implementation fails to assembler with binutils 2.40.0.20230525:
>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S: Assembler messages:
>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:30: Error: no match insn: vld  $vr0,$r4,0
>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:31: Error: no match insn: vld  $vr1,$r4,16
>>>>
>> Sorry, it's my mistake for the wrong version of binutils. Could you please try the latest release
>> version 2.41?
> Although it should work, it is unexpected that depending of the assembler used
> some optimized routines are not enabled. 

In patch v2, a new configuration variable has been added to control
whether the LASX/LSX code will be compiled, according to whether the assembler
supports LASX/LSX, so it can be compiled with old versions of binutils.
  
Adhemerval Zanella Netto Aug. 3, 2023, 1:48 p.m. UTC | #8
On 03/08/23 10:27, dengjianbo wrote:
> 
> On 2023-08-02 20:59, Adhemerval Zanella Netto wrote:
>>>>>> On 2023-08-02 10:31, Adhemerval Zanella Netto wrote:
>>>>>> +#if IS_IN (libc)
>>>>>> +# define STRLEN __strlen_aligned
>>>>>> +#else
>>>>>> +# define STRLEN strlen
>>>>>> +#endif
>>>>> Is this really an improvement over the generic implementation? It seems to
>>>>> use a quite similar strategy.
>>> Comparing with the code generated by compiler, the assembly code does an 16bytes loop
>>> unrolling, and handles ascii data and non-ascii data separately which could take less
>>> instructions to calculate the length of  ascii data. besides, the assembly code using
>>> fewer instructions to start the loop. I think the performance improvement benefits from
>>> this. Please kindly check bench result also from:
>>> https://github.com/jiadengx/glibc_test/blob/main/strlen/bench-strlen.out
>> From the summarized results [1], it seems that the initial start to mask
>> off unaligned inputs are slight better.  The __strlen_aligned onl seems
>> better to sizes larger than 32 (the 16 lenght results seems strange).
>> Maybe you coult improve shift_find/find_zero_all/index_first on loongarch.
>>
>> Does it improve by explicit instructing compiler to unroll the loop?
> As you know, the assembly versions of strlen uses the same strategy to
> calculate string length, if assembly code only calculate 8 bytes in the
> loop and don't separate ascii and non-ascii data, the code of loop and
> loop end part should be the same as the compiler generated code base on
> generic strlen. Loongarch doesn't provide instructions like alpha
> cmpbge, so there is no much optimizations could be done on
> find_zero_all/index_first/has_zero except we can remove some BIG_ENDIAN
> codes.
>  
> Refer to the latest test results in the chart: The assembly
> implementation vs. generic strlen implementation(compiled by using
> CFLAGS-strlen.c += -funroll-all-loops --param
> max-variable-expandsions-in-unroller=2) the performance
> improvement of the assembly implementation is evident(30% ~ 40%),
> especially in cases when the length is greater than 64 bytes.
> Please kindly see the results via:
> https://github.com/jiadengx/glibc_test/blob/main/strlen2/bench1/generic_strlen_with_loop_unrolling.png

So maybe use the generic implementation plus the compiler flags for loop
unrolling instead of asm optimization?

>>>>> This implementation fails to assembler with binutils 2.40.0.20230525:
>>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S: Assembler messages:
>>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:30: Error: no match insn: vld  $vr0,$r4,0
>>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:31: Error: no match insn: vld  $vr1,$r4,16
>>>>>
>>> Sorry, it's my mistake for the wrong version of binutils. Could you please try the latest release
>>> version 2.41?
>> Although it should work, it is unexpected that depending of the assembler used
>> some optimized routines are not enabled. 
> 
> In patch v2, an new configuration variable has been added to control
> whether the LASX/LSX will be compiled according to assembler support
> LASX/LSX or not, so it can be compiled with old versions of binutils.

Yes I am aware and this seems odd, albeit not really wrong.  It means that
you will get less code coverage and optimizations depending on the
binutils used.

I would advise following what other architectures did to provide arch-specific
optimizations, which is to either set up a minimum gcc/binutils version (for
instance aarch64 libmvec), or encode the instructions in a binutils-neutral
mode (as the powerpc implementation I pointed out).
  
Xi Ruoyao Aug. 3, 2023, 2:53 p.m. UTC | #9
On Thu, 2023-08-03 at 10:48 -0300, Adhemerval Zanella Netto wrote:
> On 03/08/23 10:27, dengjianbo wrote:
> > On 2023-08-02 20:59, Adhemerval Zanella Netto wrote:
> > > > > > > On 2023-08-02 10:31, Adhemerval Zanella Netto wrote:
> > > > > > > +#if IS_IN (libc)
> > > > > > > +# define STRLEN __strlen_aligned
> > > > > > > +#else
> > > > > > > +# define STRLEN strlen
> > > > > > > +#endif
> > > > > > Is this really an improvement over the generic implementation? It seems to
> > > > > > use a quite similar strategy.
> > > > Comparing with the code generated by compiler, the assembly code does an 16bytes loop
> > > > unrolling, and handles ascii data and non-ascii data separately which could take less
> > > > instructions to calculate the length of  ascii data. besides, the assembly code using
> > > > fewer instructions to start the loop. I think the performance improvement benefits from
> > > > this. Please kindly check bench result also from:
> > > > https://github.com/jiadengx/glibc_test/blob/main/strlen/bench-strlen.out
> > > From the summarized results [1], it seems that the initial start to mask
> > > off unaligned inputs are slight better.  The __strlen_aligned onl seems
> > > better to sizes larger than 32 (the 16 lenght results seems strange).
> > > Maybe you coult improve shift_find/find_zero_all/index_first on loongarch.
> > > 
> > > Does it improve by explicit instructing compiler to unroll the loop?
> > As you know, the assembly versions of strlen uses the same strategy to
> > calculate string length, if assembly code only calculate 8 bytes in the
> > loop and don't separate ascii and non-ascii data, the code of loop and
> > loop end part should be the same as the compiler generated code base on
> > generic strlen. Loongarch doesn't provide instructions like alpha
> > cmpbge, so there is no much optimizations could be done on
> > find_zero_all/index_first/has_zero except we can remove some BIG_ENDIAN
> > codes.

Removing them will not make any difference because the compiler will
optimize the BIG_ENDIAN paths away.

> > Refer to the latest test results in the chart: The assembly
> > implementation vs. generic strlen implementation(compiled by using
> > CFLAGS-strlen.c += -funroll-all-loops --param
> > max-variable-expandsions-in-unroller=2) the performance
> > improvement of the assembly implementation is evident(30% ~ 40%),
> > especially in cases when the length is greater than 64 bytes.
> > Please kindly see the results via:
> > https://github.com/jiadengx/glibc_test/blob/main/strlen2/bench1/generic_strlen_with_loop_unrolling.png
> 
> So maybe use the generic implementation plus the compiler flags to loop
> unrolling instead of asm optimization?

This is strange... I remember I'd attempted to add #pragma GCC unroll
for the main loop of strlen and I observed no performance gain on my
Loongson-3A5000-HV, at all.  Maybe a different test environment
(hardware, compiler version, or something)?

> > > > > > This implementation fails to assembler with binutils 2.40.0.20230525:
> > > > > > ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S: Assembler messages:
> > > > > > ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:30: Error: no match insn: vld  $vr0,$r4,0
> > > > > > ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:31: Error: no match insn: vld  $vr1,$r4,16
> > > > > > 
> > > > Sorry, it's my mistake for the wrong version of binutils. Could you please try the latest release
> > > > version 2.41?
> > > Although it should work, it is unexpected that depending of the assembler used
> > > some optimized routines are not enabled. 
> > 
> > In patch v2, an new configuration variable has been added to control
> > whether the LASX/LSX will be compiled according to assembler support
> > LASX/LSX or not, so it can be compiled with old versions of binutils.
> 
> Yes I am aware and this seems odd, albeit not really wrong.  It means that
> you will get less code coverage and optimizations depending of the used 
> binutils. 
> 
> I would advise to follow what other architecture did to provide arch-specific 
> optimization, which is either setup a minimum gcc/binutils version (for 
> instance aarch64 libmvec), or encode the instructions in a binutils neutral
> mode (as the powerpc implementation I pointed out).

Hmm, this policy seems different from $OTHER_PROJECTS.
  
Xi Ruoyao Aug. 3, 2023, 2:59 p.m. UTC | #10
On Thu, 2023-08-03 at 22:53 +0800, Xi Ruoyao wrote:
> > 
> > I would advise to follow what other architecture did to provide
> > arch-specific 
> > optimization, which is either setup a minimum gcc/binutils version
> > (for 
> > instance aarch64 libmvec), or encode the instructions in a binutils
> > neutral
> > mode (as the powerpc implementation I pointed out).
> 
> Hmm, this policy seems different from $OTHER_PROJECTS.

BTW I guess we should start to document some general rules about
machine-specific optimizations in
https://sourceware.org/glibc/wiki/Consensus or somewhere.
  
Adhemerval Zanella Netto Aug. 3, 2023, 4:29 p.m. UTC | #11
On 03/08/23 11:59, Xi Ruoyao wrote:
> On Thu, 2023-08-03 at 22:53 +0800, Xi Ruoyao wrote:
>>>
>>> I would advise to follow what other architecture did to provide
>>> arch-specific 
>>> optimization, which is either setup a minimum gcc/binutils version
>>> (for 
>>> instance aarch64 libmvec), or encode the instructions in a binutils
>>> neutral
>>> mode (as the powerpc implementation I pointed out).
>>
>> Hmm, this policy seems different from $OTHER_PROJECTS.


We don't have a strict policy regarding it, but I think having fewer
configuration permutations to test helps in maintainability.  For instance,
with the --enable-fortify-source I had to test 2/3 *times* more build
permutations to see if every architecture did build for all supported
gcc versions.

> 
> BTW I guess we should start to document some general rules about
> machine-specific optimizations in
> https://sourceware.org/glibc/wiki/Consensus or somewhere.
> 

In fact my understanding is that the arch maintainer may define how to proceed
in this regard.
  
caiyinyu Aug. 4, 2023, 1:50 a.m. UTC | #12
.....
>>> Refer to the latest test results in the chart: The assembly
>>> implementation vs. generic strlen implementation(compiled by using
>>> CFLAGS-strlen.c += -funroll-all-loops --param
>>> max-variable-expandsions-in-unroller=2) the performance
>>> improvement of the assembly implementation is evident(30% ~ 40%),
>>> especially in cases when the length is greater than 64 bytes.
>>> Please kindly see the results via:
>>> https://github.com/jiadengx/glibc_test/blob/main/strlen2/bench1/generic_strlen_with_loop_unrolling.png
>> So maybe use the generic implementation plus the compiler flags to loop
>> unrolling instead of asm optimization?
> This is strange... I remember I'd attempted to add #pragma GCC unroll
> for the main loop of strlen and I observed no performance gain on my
> Loongson-3A5000-HV, at all.  Maybe a different test environment
> (hardware, compiler version, or something)?

The name of his graph is ambiguous. What he means is that our assembly 
implementation performs better

than the generic code implementation (plus the compiler flags for loop 
unrolling),

and our assembly implementation improves performance by 30% to 40%,

especially in cases where the length is greater than 64 bytes.

https://github.com/jiadengx/glibc_test/blob/main/strlen2/bench1/generic_strlen_with_loop_unrolling.png

>
>>>>>>> This implementation fails to assembler with binutils 2.40.0.20230525:
>>>>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S: Assembler messages:
>>>>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:30: Error: no match insn: vld  $vr0,$r4,0
>>>>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:31: Error: no match insn: vld  $vr1,$r4,16
>>>>>>>
>>>>> Sorry, it's my mistake for the wrong version of binutils. Could you please try the latest release
>>>>> version 2.41?
>>>> Although it should work, it is unexpected that depending of the assembler used
>>>> some optimized routines are not enabled.
>>> In patch v2, an new configuration variable has been added to control
>>> whether the LASX/LSX will be compiled according to assembler support
>>> LASX/LSX or not, so it can be compiled with old versions of binutils.
>> Yes I am aware and this seems odd, albeit not really wrong.  It means that
>> you will get less code coverage and optimizations depending of the used
>> binutils.
>>
>> I would advise to follow what other architecture did to provide arch-specific
>> optimization, which is either setup a minimum gcc/binutils version (for
>> instance aarch64 libmvec), or encode the instructions in a binutils neutral
>> mode (as the powerpc implementation I pointed out).
> Hmm, this policy seems different from $OTHER_PROJECTS.

I prefer the first plan: setting a minimum version limit for gcc/binutils.


>
  
dengjianbo Aug. 4, 2023, 10 a.m. UTC | #13
On 2023-08-03 21:48, Adhemerval Zanella Netto wrote:
>>>>>> This implementation fails to assembler with binutils 2.40.0.20230525:
>>>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S: Assembler messages:
>>>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:30: Error: no match insn: vld  $vr0,$r4,0
>>>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:31: Error: no match insn: vld  $vr1,$r4,16
>>>>>>
>>>> Sorry, it's my mistake for the wrong version of binutils. Could you please try the latest release
>>>> version 2.41?
>>> Although it should work, it is unexpected that depending of the assembler used
>>> some optimized routines are not enabled. 
>> In patch v2, an new configuration variable has been added to control
>> whether the LASX/LSX will be compiled according to assembler support
>> LASX/LSX or not, so it can be compiled with old versions of binutils.
> Yes I am aware and this seems odd, albeit not really wrong.  It means that
> you will get less code coverage and optimizations depending of the used 
> binutils. 
>
> I would advise to follow what other architecture did to provide arch-specific 
> optimization, which is either setup a minimum gcc/binutils version (for 
> instance aarch64 libmvec), or encode the instructions in a binutils neutral
> mode (as the powerpc implementation I pointed out).

We have set up a minimum binutils version in patch v3; please find it
via:
https://sourceware.org/pipermail/libc-alpha/2023-August/150670.html
  

Patch

diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
new file mode 100644
index 0000000000..529a8b6bab
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/Makefile
@@ -0,0 +1,3 @@ 
+ifeq ($(subdir),string)
+sysdep_routines += strlen-aligned strlen-lsx strlen-lasx
+endif
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
new file mode 100644
index 0000000000..b35e41127e
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
@@ -0,0 +1,39 @@ 
+/* Enumerate available IFUNC implementations of a function.  LoongArch64 version.
+   Copyright (C) 2017-2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <assert.h>
+#include <string.h>
+#include <wchar.h>
+#include <ldsodefs.h>
+#include <ifunc-impl-list.h>
+#include <stdio.h>
+
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+			size_t max)
+{
+
+  size_t i = max;
+
+  IFUNC_IMPL (i, name, strlen,
+	      IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LASX, __strlen_lasx)
+	      IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LSX, __strlen_lsx)
+	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned)
+	      )
+  return i;
+}
diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h
new file mode 100644
index 0000000000..e2b3490f39
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h
@@ -0,0 +1,36 @@ 
+/* Common definition for strlen implementation.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017-2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <ldsodefs.h>
+#include <ifunc-init.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  if (SUPPORT_LASX)
+    return OPTIMIZE (lasx);
+  else if (SUPPORT_LSX)
+    return OPTIMIZE (lsx);
+  else
+    return OPTIMIZE (aligned);
+}
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
new file mode 100644
index 0000000000..b379e978a7
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S
@@ -0,0 +1,101 @@ 
+/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+# define STRLEN __strlen_aligned
+#else
+# define STRLEN strlen
+#endif
+
+LEAF(STRLEN, 6)  /* strlen using only aligned 8-byte loads: a0 = string on entry, length on return.  */
+    move        a1, a0  /* a1 = original pointer, kept for the final subtraction.  */
+    bstrins.d   a0, zero, 2, 0  /* Clear bits 2..0: round a0 down to an 8-byte boundary.  */
+    lu12i.w     a2, 0x01010  /* a2 = 0x01010000, first step of the 0x01 byte pattern.  */
+    li.w        t0, -1  /* All ones, source for the head mask below.  */
+
+    ld.d        t2, a0, 0  /* Aligned doubleword that contains the first string byte.  */
+    andi        t1, a1, 0x7  /* Byte offset of the string inside that doubleword.  */
+    ori         a2, a2, 0x101  /* a2 = 0x01010101.  */
+    slli.d      t1, t1, 3  /* Byte offset -> bit offset.  */
+
+    bstrins.d   a2, a2, 63, 32  /* Copy the low half up: a2 = 0x0101010101010101.  */
+    sll.d       t1, t0, t1  /* t1 = ones from the bit offset upwards, zeros in the low-order bytes below it.  */
+    slli.d      t3, a2, 7  /* t3 = 0x8080808080808080.  */
+    nor         a3, zero, t3  /* a3 = ~t3 = 0x7f7f7f7f7f7f7f7f.  */
+
+    orn         t2, t2, t1  /* Force the bytes below the offset to 0xff so they cannot look like NUL.  */
+    sub.d       t0, t2, a2  /* t0 = x - 0x01...01.  */
+    nor         t1, t2, a3  /* t1 = ~x & 0x80...80.  */
+    and         t0, t0, t1  /* Exact test: non-zero iff x contains a zero byte.  */
+
+
+    bnez        t0, L(count_pos)  /* NUL already in the first doubleword.  */
+    addi.d      a0, a0, 8  /* Advance to the next doubleword.  */
+L(loop_16_7bit):  /* Fast loop, 16 bytes per iteration, using a cheaper two-instruction test.  */
+    ld.d        t2, a0, 0
+    sub.d       t1, t2, a2  /* t1 = x - 0x01...01.  */
+
+    and         t0, t1, t3  /* Quick test: zero means no NUL; non-zero may also come from bytes with the top bit set.  */
+    bnez        t0, L(more_check)  /* Possible NUL in the first doubleword; a0 still points at it.  */
+    ld.d        t2, a0, 8  /* Second doubleword of this iteration.  */
+    sub.d       t1, t2, a2
+
+    and         t0, t1, t3  /* Same quick test on the second doubleword.  */
+    addi.d      a0, a0, 16  /* Step past both doublewords.  */
+    beqz        t0, L(loop_16_7bit)
+    addi.d      a0, a0, -8  /* Candidate was in the second doubleword: point a0 back at it.  */
+
+L(more_check):  /* Re-check the candidate (x in t2, x - 0x01...01 in t1) with the exact test.  */
+    nor         t0, t2, a3  /* t0 = ~x & 0x80...80.  */
+    and         t0, t1, t0
+    bnez        t0, L(count_pos)  /* A real NUL was found.  */
+    addi.d      a0, a0, 8  /* False positive: skip this doubleword and continue with the exact loop.  */
+
+
+L(loop_16_8bit):  /* Exact loop, 16 bytes per iteration, valid for any byte values.  */
+    ld.d        t2, a0, 0
+    sub.d       t1, t2, a2
+    nor         t0, t2, a3
+    and         t0, t0, t1  /* (x - 0x01...01) & ~x & 0x80...80.  */
+
+    bnez        t0, L(count_pos)  /* NUL in the first doubleword; a0 points at it.  */
+    ld.d        t2, a0, 8  /* Second doubleword of this iteration.  */
+    addi.d      a0, a0, 16
+    sub.d       t1, t2, a2
+
+    nor         t0, t2, a3
+    and         t0, t0, t1
+    beqz        t0, L(loop_16_8bit)
+    addi.d      a0, a0, -8  /* NUL was in the second doubleword: point a0 back at it.  */
+
+L(count_pos):  /* t0 = marker bits from the exact test, a0 = address of the doubleword holding the NUL.  */
+    ctz.d       t1, t0  /* Bit index of the lowest marker.  */
+    sub.d       a0, a0, a1  /* Offset of that doubleword from the string start (may be negative for the head).  */
+    srli.d      t1, t1, 3  /* Bit index -> byte index inside the doubleword.  */
+    add.d       a0, a0, t1  /* Length = doubleword offset + byte index.  */
+
+    jr          ra
+END(STRLEN)
+
+#ifdef _LIBC
+libc_hidden_builtin_def (STRLEN)
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
new file mode 100644
index 0000000000..56ac6403d3
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S
@@ -0,0 +1,65 @@ 
+/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+
+# define STRLEN __strlen_lasx
+
+LEAF(STRLEN, 6)  /* strlen using 256-bit LASX loads: a0 = string on entry, length on return.  */
+    move            a1, a0  /* a1 = original pointer.  */
+    bstrins.d       a0, zero, 4, 0  /* Clear bits 4..0: round a0 down to a 32-byte boundary.  */
+    li.d            t1, -1  /* Mask value that means no NUL was seen.  */
+    xvld            xr0, a0, 0  /* Aligned 32-byte chunk that contains the first string byte.  */
+
+    xvmsknz.b       xr0, xr0  /* Per 128-bit half: one mask bit per byte, set when the byte is non-zero.  */
+    xvpickve.w      xr1, xr0, 4  /* Bring the upper half's mask (word element 4) down into xr1.  */
+    vilvl.h         vr0, vr1, vr0  /* Interleave low halfwords: low word of vr0 = 32-bit mask for the whole chunk.  */
+    movfr2gr.s      t0, fa0  # sign extend
+
+    sra.w           t0, t0, a1  /* Drop mask bits of bytes in front of the string (shift count = low 5 bits of a1).  */
+    beq             t0, t1, L(loop)  /* All ones: no NUL in the first chunk.  */
+    cto.w           a0, t0  /* Count of trailing ones = number of bytes before the first NUL.  */
+    jr              ra
+
+L(loop):  /* Scan the following chunks, 32 bytes per iteration.  */
+    xvld            xr0, a0, 32  /* Next aligned chunk.  */
+    addi.d          a0, a0, 32  /* a0 now points at the chunk just loaded.  */
+    xvsetanyeqz.b   fcc0, xr0  /* fcc0 = 1 when any byte of the chunk is zero.  */
+    bceqz           fcc0, L(loop)  /* Keep going while the chunk holds no NUL.  */
+
+
+    xvmsknz.b       xr0, xr0  /* Build the non-zero byte mask for the chunk with the NUL.  */
+    sub.d           a0, a0, a1  /* Offset of this chunk from the string start.  */
+    xvpickve.w      xr1, xr0, 4  /* Upper half's mask.  */
+    vilvl.h         vr0, vr1, vr0  /* Combine both halves into one 32-bit mask.  */
+
+    movfr2gr.s      t0, fa0  /* Move the mask to a general register.  */
+    cto.w           t0, t0  /* Index of the first NUL inside the chunk.  */
+    add.d           a0, a0, t0  /* Length = chunk offset + index.  */
+    jr              ra
+END(STRLEN)
+
+#ifdef _LIBC
+libc_hidden_builtin_def (STRLEN)
+#endif
+
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
new file mode 100644
index 0000000000..1c19c98b5b
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S
@@ -0,0 +1,73 @@ 
+/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/regdef.h>
+#include <sys/asm.h>
+
+#if IS_IN (libc)
+
+# define STRLEN __strlen_lsx
+
+LEAF(STRLEN, 6)  /* strlen using 128-bit LSX loads, two per step: a0 = string on entry, length on return.  */
+    move            a1, a0  /* a1 = original pointer.  */
+    bstrins.d       a0, zero, 4, 0  /* Clear bits 4..0: round a0 down to a 32-byte boundary.  */
+    vld             vr0, a0, 0  /* Low 16 bytes of the aligned 32-byte chunk.  */
+    vld             vr1, a0, 16  /* High 16 bytes of the chunk.  */
+
+    li.d            t1, -1  /* Mask value that means no NUL was seen.  */
+    vmsknz.b        vr0, vr0  /* One mask bit per byte, set when the byte is non-zero.  */
+    vmsknz.b        vr1, vr1
+    vilvl.h         vr0, vr1, vr0  /* Interleave low halfwords: low word of vr0 = 32-bit mask for the chunk.  */
+
+    movfr2gr.s      t0, fa0  /* Move the mask to a general register.  */
+    sra.w           t0, t0, a1  /* Drop mask bits of bytes in front of the string (shift count = low 5 bits of a1).  */
+    beq             t0, t1, L(loop)  /* All ones: no NUL in the first chunk.  */
+    cto.w           a0, t0  /* Count of trailing ones = number of bytes before the first NUL.  */
+
+    jr              ra
+    nop  /* Padding; NOTE(review): presumably keeps L(loop) aligned -- confirm.  */
+    nop
+    nop
+
+
+L(loop):  /* Scan the following chunks, 32 bytes per iteration.  */
+    vld             vr0, a0, 32  /* Low 16 bytes of the next chunk.  */
+    vld             vr1, a0, 48  /* High 16 bytes of the next chunk.  */
+    addi.d          a0, a0, 32  /* a0 now points at the chunk just loaded.  */
+    vmin.bu         vr2, vr0, vr1  /* Bytewise unsigned minimum: has a zero byte iff either vector has one.  */
+
+    vsetanyeqz.b    fcc0, vr2  /* fcc0 = 1 when any byte of vr2 is zero.  */
+    bceqz           fcc0, L(loop)  /* Keep going while the chunk holds no NUL.  */
+    vmsknz.b        vr0, vr0  /* Build the non-zero byte masks for the chunk with the NUL.  */
+    vmsknz.b        vr1, vr1
+
+    vilvl.h         vr0, vr1, vr0  /* Combine both masks into one 32-bit mask.  */
+    sub.d           a0, a0, a1  /* Offset of this chunk from the string start.  */
+    movfr2gr.s      t0, fa0
+    cto.w           t0, t0  /* Index of the first NUL inside the chunk.  */
+
+    add.d           a0, a0, t0  /* Length = chunk offset + index.  */
+    jr              ra
+END(STRLEN)
+
+#ifdef _LIBC
+libc_hidden_builtin_def (STRLEN)
+#endif
+
+#endif
diff --git a/sysdeps/loongarch/lp64/multiarch/strlen.c b/sysdeps/loongarch/lp64/multiarch/strlen.c
new file mode 100644
index 0000000000..416ed0d9e2
--- /dev/null
+++ b/sysdeps/loongarch/lp64/multiarch/strlen.c
@@ -0,0 +1,37 @@ 
+/* Multiple versions of strlen.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017-2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+
+#if IS_IN (libc)
+# define strlen __redirect_strlen  /* While <string.h> is read, its strlen prototype is declared under this name.  */
+# include <string.h>
+# undef strlen
+
+# define SYMBOL_NAME strlen  /* NOTE(review): presumably consumed by ifunc-init.h to form REDIRECT_NAME and OPTIMIZE -- confirm.  */
+# include "ifunc-strlen.h"  /* Declares the strlen variants and defines IFUNC_SELECTOR.  */
+
+libc_ifunc_redirected (__redirect_strlen, strlen, IFUNC_SELECTOR ());  /* NOTE(review): presumably defines strlen as an ifunc resolved through IFUNC_SELECTOR -- confirm.  */
+
+# ifdef SHARED
+__hidden_ver1 (strlen, __GI_strlen, __redirect_strlen)  /* NOTE(review): presumably the hidden alias used for calls from inside libc -- confirm.  */
+  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlen);
+# endif  /* SHARED */
+
+#endif  /* IS_IN (libc) */
diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h
index 5100f36d24..524d2e3277 100644
--- a/sysdeps/loongarch/sys/regdef.h
+++ b/sysdeps/loongarch/sys/regdef.h
@@ -89,6 +89,14 @@ 
 #define fs5 $f29
 #define fs6 $f30
 #define fs7 $f31
+#define fcc0 $fcc0	/* fcc0-fcc7: condition flag registers, so assembly can name them without the '$'.  */
+#define fcc1 $fcc1
+#define fcc2 $fcc2
+#define fcc3 $fcc3
+#define fcc4 $fcc4
+#define fcc5 $fcc5
+#define fcc6 $fcc6
+#define fcc7 $fcc7
 
 #define vr0 $vr0
 #define vr1 $vr1
@@ -98,6 +106,30 @@ 
 #define vr5 $vr5
 #define vr6 $vr6
 #define vr7 $vr7
+#define vr8 $vr8	/* vr8-vr31: the rest of the 128-bit LSX vector register names.  */
+#define vr9 $vr9
+#define vr10 $vr10
+#define vr11 $vr11
+#define vr12 $vr12
+#define vr13 $vr13
+#define vr14 $vr14
+#define vr15 $vr15
+#define vr16 $vr16
+#define vr17 $vr17
+#define vr18 $vr18
+#define vr19 $vr19
+#define vr20 $vr20
+#define vr21 $vr21
+#define vr22 $vr22
+#define vr23 $vr23
+#define vr24 $vr24
+#define vr25 $vr25
+#define vr26 $vr26
+#define vr27 $vr27
+#define vr28 $vr28
+#define vr29 $vr29
+#define vr30 $vr30
+#define vr31 $vr31
 
 #define xr0 $xr0
 #define xr1 $xr1
@@ -107,5 +139,29 @@ 
 #define xr5 $xr5
 #define xr6 $xr6
 #define xr7 $xr7
+#define xr8 $xr8	/* xr8-xr31: the rest of the 256-bit LASX vector register names.  */
+#define xr9 $xr9
+#define xr10 $xr10
+#define xr11 $xr11
+#define xr12 $xr12
+#define xr13 $xr13
+#define xr14 $xr14
+#define xr15 $xr15
+#define xr16 $xr16
+#define xr17 $xr17
+#define xr18 $xr18
+#define xr19 $xr19
+#define xr20 $xr20
+#define xr21 $xr21
+#define xr22 $xr22
+#define xr23 $xr23
+#define xr24 $xr24
+#define xr25 $xr25
+#define xr26 $xr26
+#define xr27 $xr27
+#define xr28 $xr28
+#define xr29 $xr29
+#define xr30 $xr30
+#define xr31 $xr31
 
 #endif /* _SYS_REGDEF_H */
diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
index e371e13b15..d1a280a5ee 100644
--- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h
@@ -25,5 +25,7 @@ 
 #define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX)
 #define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX)
 
+#define INIT_ARCH()  /* Expands to nothing.  NOTE(review): presumably required by the generic ifunc headers -- confirm.  */
+
 #endif /* _CPU_FEATURES_LOONGARCH64_H  */