[v7,2/2] RISC-V: add riscv vector support for memcpy

Message ID 20250221095740.582183-3-daichengrong@iscas.ac.cn (mailing list archive)
State New
Headers
Series RISC-V: add multiarch RVV support for memcpy using FMV IFUNC |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Build passed
redhat-pt-bot/TryBot-32bit success Build for i686
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Test passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Test passed
redhat-pt-bot/TryBot-still_applies warning Patch no longer applies to master

Commit Message

daichengrong Feb. 21, 2025, 9:57 a.m. UTC
  From: daichengrong <daichengrong@iscas.ac.cn>

Change in v7:
   update dl_hwcap support using kernel header
   compile the rvv memcpy with ".option arch, +v"
   delete optimization for small lengths

Change in v6:
   Optimize the RVV memcpy for small lengths less than VLEN/8 bytes

Changes in v5:
   check ifunc-impl-list memcpy vector support by dl_hwcap

Changes in v4:
   update rvv memcpy support by compiler 
   check whether rvv enabled by dl_hwcap

Changes in v2:
   delete size-0 branch
---
 sysdeps/riscv/multiarch/memcpy_vector.S       | 37 +++++++++++++++++++
 .../unix/sysv/linux/riscv/multiarch/Makefile  |  6 +++
 .../linux/riscv/multiarch/ifunc-impl-list.c   | 14 +++++++
 .../unix/sysv/linux/riscv/multiarch/memcpy.c  |  8 ++++
 4 files changed, 65 insertions(+)
 create mode 100644 sysdeps/riscv/multiarch/memcpy_vector.S
  

Comments

Anton Blanchard Feb. 22, 2025, 9:11 a.m. UTC | #1
Hi,

On Fri, Feb 21, 2025 at 8:57 PM <daichengrong@iscas.ac.cn> wrote:
> From: daichengrong <daichengrong@iscas.ac.cn>
>
> Change in v7:
>    update dl_hwcap support using kernel header
>    update rvv memcpy compile with option,+v
>    delete optimization for small lengths

I tested this again on our upcoming Tenstorrent Ascalon CPU and (as
before) it's a 3x speedup on long lengths, a clear improvement for
everything 16 bytes onwards and no significant regressions on lengths
less than 16 bytes.

Tested-by: Anton Blanchard <antonb@tenstorrent.com>

Anton

> Change in v6:
>    Optimize the RVV memcpy for small lengths less than VLEN/8 bytes
>
> Changes in v5:
>    check ifunc-impl-list memcpy vector support with by dl_hwcap
>
> Changes in v4:
>    update rvv memcpy support by compiler
>    check whether rvv enabled by dl_hwcap
>
> Changes in v2:
>    delete size-0 branch
> ---
>  sysdeps/riscv/multiarch/memcpy_vector.S       | 37 +++++++++++++++++++
>  .../unix/sysv/linux/riscv/multiarch/Makefile  |  6 +++
>  .../linux/riscv/multiarch/ifunc-impl-list.c   | 14 +++++++
>  .../unix/sysv/linux/riscv/multiarch/memcpy.c  |  8 ++++
>  4 files changed, 65 insertions(+)
>  create mode 100644 sysdeps/riscv/multiarch/memcpy_vector.S
>
> diff --git a/sysdeps/riscv/multiarch/memcpy_vector.S b/sysdeps/riscv/multiarch/memcpy_vector.S
> new file mode 100644
> index 0000000000..eaf28aaf6d
> --- /dev/null
> +++ b/sysdeps/riscv/multiarch/memcpy_vector.S
> @@ -0,0 +1,37 @@
> +/* memcpy for RISC-V Vector.
> +   Copyright (C) 2024-2025 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +#include <sys/asm.h>
> +
> +ENTRY (__memcpy_vector)
> +.option push
> +.option arch, +v
> +    mv     a6, a0
> +L(loop):
> +    vsetvli a3,a2,e8,m8,ta,ma
> +    vle8.v  v8,(a1)
> +    vse8.v  v8,(a6)
> +    add     a1,a1,a3
> +    sub     a2,a2,a3
> +    add     a6,a6,a3
> +    bnez    a2,L(loop)
> +    ret
> +.option pop
> +END (__memcpy_vector)
> diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile b/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
> index fcef5659d4..478338006b 100644
> --- a/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
> +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
> @@ -5,5 +5,11 @@ sysdep_routines += \
>    memcpy_noalignment \
>    # sysdep_routines
>
> +ifeq ($(have-gcc-riscv-rvv),yes)
> +sysdep_routines += \
> +  memcpy_vector \
> +  # rvv sysdep_routines
> +endif
> +
>  CFLAGS-memcpy_noalignment.c += -mno-strict-align
>  endif
> diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c b/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
> index 1c1deca8f6..26f3376d23 100644
> --- a/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
> @@ -19,6 +19,8 @@
>  #include <ifunc-impl-list.h>
>  #include <string.h>
>  #include <sys/hwprobe.h>
> +#include <ldsodefs.h>
> +#include <asm/hwcap.h>
>
>  size_t
>  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> @@ -27,6 +29,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    size_t i = max;
>
>    bool fast_unaligned = false;
> +#if defined(HAVE_RISCV_ASM_VECTOR_SUPPORT)
> +  bool rvv_ext = false;
> +#endif
>
>    struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_CPUPERF_0 };
>    if (__riscv_hwprobe (&pair, 1, 0, NULL, 0) == 0
> @@ -34,7 +39,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>            == RISCV_HWPROBE_MISALIGNED_FAST)
>      fast_unaligned = true;
>
> +#if defined(HAVE_RISCV_ASM_VECTOR_SUPPORT)
> +  if (GLRO(dl_hwcap) & COMPAT_HWCAP_ISA_V)
> +    rvv_ext = true;
> +#endif
> +
>    IFUNC_IMPL (i, name, memcpy,
> +#if defined(HAVE_RISCV_ASM_VECTOR_SUPPORT)
> +             IFUNC_IMPL_ADD (array, i, memcpy, rvv_ext,
> +                             __memcpy_vector)
> +#endif
>               IFUNC_IMPL_ADD (array, i, memcpy, fast_unaligned,
>                               __memcpy_noalignment)
>               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
> diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c b/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
> index 8544f5402a..4bedd21866 100644
> --- a/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
> +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
> @@ -27,16 +27,24 @@
>  # include <ifunc-init.h>
>  # include <riscv-ifunc.h>
>  # include <sys/hwprobe.h>
> +# include <asm/hwcap.h>
>
>  extern __typeof (__redirect_memcpy) __libc_memcpy;
>
>  extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
>  extern __typeof (__redirect_memcpy) __memcpy_noalignment attribute_hidden;
> +extern __typeof (__redirect_memcpy) __memcpy_vector attribute_hidden;
>
>  static inline __typeof (__redirect_memcpy) *
>  select_memcpy_ifunc (uint64_t dl_hwcap, __riscv_hwprobe_t hwprobe_func)
>  {
>    unsigned long long int v;
> +
> +#if defined(HAVE_RISCV_ASM_VECTOR_SUPPORT)
> +  if (dl_hwcap & COMPAT_HWCAP_ISA_V)
> +    return __memcpy_vector;
> +#endif
> +
>    if (__riscv_hwprobe_one (hwprobe_func, RISCV_HWPROBE_KEY_CPUPERF_0, &v) == 0
>        && (v & RISCV_HWPROBE_MISALIGNED_MASK) == RISCV_HWPROBE_MISALIGNED_FAST)
>      return __memcpy_noalignment;
> --
> 2.25.1
>
>
  
daichengrong March 7, 2025, 9:24 a.m. UTC | #2
> On Fri, Feb 21, 2025 at 8:57 PM <daichengrong@iscas.ac.cn> wrote:
>> From: daichengrong <daichengrong@iscas.ac.cn>
>>
>> Change in v7:
>>     update dl_hwcap support using kernel header
>>     update rvv memcpy compile with option,+v
>>     delete optimization for small lengths
> I tested this again on our upcoming Tenstorrent Ascalon CPU and (as
> before) it's a 3x speedup on long lengths, a clear improvement for
> everything 16 bytes onwards and no significant regressions on lengths
> less than 16 bytes.
>
> Tested-by: Anton Blanchard <antonb@tenstorrent.com>
>
> Anton

On Banana Pi F3 (256-bit RVV), it's a 3x speedup on long lengths.
And on K230 (128-bit RVV), it gets a 2x speedup on long lengths.

Tested-by: daichengrong <daichengrong@iscas.ac.cn>

>> Change in v6:
>>     Optimize the RVV memcpy for small lengths less than VLEN/8 bytes
>>
>> Changes in v5:
>>     check ifunc-impl-list memcpy vector support with by dl_hwcap
>>
>> Changes in v4:
>>     update rvv memcpy support by compiler
>>     check whether rvv enabled by dl_hwcap
>>
>> Changes in v2:
>>     delete size-0 branch
>> ---
>>   sysdeps/riscv/multiarch/memcpy_vector.S       | 37 +++++++++++++++++++
>>   .../unix/sysv/linux/riscv/multiarch/Makefile  |  6 +++
>>   .../linux/riscv/multiarch/ifunc-impl-list.c   | 14 +++++++
>>   .../unix/sysv/linux/riscv/multiarch/memcpy.c  |  8 ++++
>>   4 files changed, 65 insertions(+)
>>   create mode 100644 sysdeps/riscv/multiarch/memcpy_vector.S
>>
>> diff --git a/sysdeps/riscv/multiarch/memcpy_vector.S b/sysdeps/riscv/multiarch/memcpy_vector.S
>> new file mode 100644
>> index 0000000000..eaf28aaf6d
>> --- /dev/null
>> +++ b/sysdeps/riscv/multiarch/memcpy_vector.S
>> @@ -0,0 +1,37 @@
>> +/* memcpy for RISC-V Vector.
>> +   Copyright (C) 2024-2025 Free Software Foundation, Inc.
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <https://www.gnu.org/licenses/>.  */
>> +
>> +
>> +#include <sysdep.h>
>> +#include <sys/asm.h>
>> +
>> +ENTRY (__memcpy_vector)
>> +.option push
>> +.option arch, +v
>> +    mv     a6, a0
>> +L(loop):
>> +    vsetvli a3,a2,e8,m8,ta,ma
>> +    vle8.v  v8,(a1)
>> +    vse8.v  v8,(a6)
>> +    add     a1,a1,a3
>> +    sub     a2,a2,a3
>> +    add     a6,a6,a3
>> +    bnez    a2,L(loop)
>> +    ret
>> +.option pop
>> +END (__memcpy_vector)
>> diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile b/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
>> index fcef5659d4..478338006b 100644
>> --- a/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
>> +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
>> @@ -5,5 +5,11 @@ sysdep_routines += \
>>     memcpy_noalignment \
>>     # sysdep_routines
>>
>> +ifeq ($(have-gcc-riscv-rvv),yes)
>> +sysdep_routines += \
>> +  memcpy_vector \
>> +  # rvv sysdep_routines
>> +endif
>> +
>>   CFLAGS-memcpy_noalignment.c += -mno-strict-align
>>   endif
>> diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c b/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
>> index 1c1deca8f6..26f3376d23 100644
>> --- a/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
>> +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
>> @@ -19,6 +19,8 @@
>>   #include <ifunc-impl-list.h>
>>   #include <string.h>
>>   #include <sys/hwprobe.h>
>> +#include <ldsodefs.h>
>> +#include <asm/hwcap.h>
>>
>>   size_t
>>   __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>> @@ -27,6 +29,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>     size_t i = max;
>>
>>     bool fast_unaligned = false;
>> +#if defined(HAVE_RISCV_ASM_VECTOR_SUPPORT)
>> +  bool rvv_ext = false;
>> +#endif
>>
>>     struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_CPUPERF_0 };
>>     if (__riscv_hwprobe (&pair, 1, 0, NULL, 0) == 0
>> @@ -34,7 +39,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>             == RISCV_HWPROBE_MISALIGNED_FAST)
>>       fast_unaligned = true;
>>
>> +#if defined(HAVE_RISCV_ASM_VECTOR_SUPPORT)
>> +  if (GLRO(dl_hwcap) & COMPAT_HWCAP_ISA_V)
>> +    rvv_ext = true;
>> +#endif
>> +
>>     IFUNC_IMPL (i, name, memcpy,
>> +#if defined(HAVE_RISCV_ASM_VECTOR_SUPPORT)
>> +             IFUNC_IMPL_ADD (array, i, memcpy, rvv_ext,
>> +                             __memcpy_vector)
>> +#endif
>>                IFUNC_IMPL_ADD (array, i, memcpy, fast_unaligned,
>>                                __memcpy_noalignment)
>>                IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
>> diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c b/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
>> index 8544f5402a..4bedd21866 100644
>> --- a/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
>> +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
>> @@ -27,16 +27,24 @@
>>   # include <ifunc-init.h>
>>   # include <riscv-ifunc.h>
>>   # include <sys/hwprobe.h>
>> +# include <asm/hwcap.h>
>>
>>   extern __typeof (__redirect_memcpy) __libc_memcpy;
>>
>>   extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
>>   extern __typeof (__redirect_memcpy) __memcpy_noalignment attribute_hidden;
>> +extern __typeof (__redirect_memcpy) __memcpy_vector attribute_hidden;
>>
>>   static inline __typeof (__redirect_memcpy) *
>>   select_memcpy_ifunc (uint64_t dl_hwcap, __riscv_hwprobe_t hwprobe_func)
>>   {
>>     unsigned long long int v;
>> +
>> +#if defined(HAVE_RISCV_ASM_VECTOR_SUPPORT)
>> +  if (dl_hwcap & COMPAT_HWCAP_ISA_V)
>> +    return __memcpy_vector;
>> +#endif
>> +
>>     if (__riscv_hwprobe_one (hwprobe_func, RISCV_HWPROBE_KEY_CPUPERF_0, &v) == 0
>>         && (v & RISCV_HWPROBE_MISALIGNED_MASK) == RISCV_HWPROBE_MISALIGNED_FAST)
>>       return __memcpy_noalignment;
>> --
>> 2.25.1
>>
>>
  
Vineet Gupta Aug. 16, 2025, 4:41 a.m. UTC | #3
On 2/22/25 01:11, Anton Blanchard wrote:
> Hi,
>
> On Fri, Feb 21, 2025 at 8:57 PM <daichengrong@iscas.ac.cn> wrote:
>> From: daichengrong <daichengrong@iscas.ac.cn>
>>
>> Change in v7:
>>    update dl_hwcap support using kernel header
>>    update rvv memcpy compile with option,+v
>>    delete optimization for small lengths
> I tested this again on our upcoming Tenstorrent Ascalon CPU and (as
> before) it's a 3x speedup on long lengths, a clear improvement for
> everything 16 bytes onwards and no significant regressions on lengths
> less than 16 bytes.
>
> Tested-by: Anton Blanchard <antonb@tenstorrent.com>

Hi Palmer,

Is this patch series blocked by something?

Thx,
-Vineet

>
> Anton
>
>> Change in v6:
>>    Optimize the RVV memcpy for small lengths less than VLEN/8 bytes
>>
>> Changes in v5:
>>    check ifunc-impl-list memcpy vector support with by dl_hwcap
>>
>> Changes in v4:
>>    update rvv memcpy support by compiler
>>    check whether rvv enabled by dl_hwcap
>>
>> Changes in v2:
>>    delete size-0 branch
>> ---
>>  sysdeps/riscv/multiarch/memcpy_vector.S       | 37 +++++++++++++++++++
>>  .../unix/sysv/linux/riscv/multiarch/Makefile  |  6 +++
>>  .../linux/riscv/multiarch/ifunc-impl-list.c   | 14 +++++++
>>  .../unix/sysv/linux/riscv/multiarch/memcpy.c  |  8 ++++
>>  4 files changed, 65 insertions(+)
>>  create mode 100644 sysdeps/riscv/multiarch/memcpy_vector.S
>>
>> diff --git a/sysdeps/riscv/multiarch/memcpy_vector.S b/sysdeps/riscv/multiarch/memcpy_vector.S
>> new file mode 100644
>> index 0000000000..eaf28aaf6d
>> --- /dev/null
>> +++ b/sysdeps/riscv/multiarch/memcpy_vector.S
>> @@ -0,0 +1,37 @@
>> +/* memcpy for RISC-V Vector.
>> +   Copyright (C) 2024-2025 Free Software Foundation, Inc.
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <https://www.gnu.org/licenses/>.  */
>> +
>> +
>> +#include <sysdep.h>
>> +#include <sys/asm.h>
>> +
>> +ENTRY (__memcpy_vector)
>> +.option push
>> +.option arch, +v
>> +    mv     a6, a0
>> +L(loop):
>> +    vsetvli a3,a2,e8,m8,ta,ma
>> +    vle8.v  v8,(a1)
>> +    vse8.v  v8,(a6)
>> +    add     a1,a1,a3
>> +    sub     a2,a2,a3
>> +    add     a6,a6,a3
>> +    bnez    a2,L(loop)
>> +    ret
>> +.option pop
>> +END (__memcpy_vector)
>> diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile b/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
>> index fcef5659d4..478338006b 100644
>> --- a/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
>> +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
>> @@ -5,5 +5,11 @@ sysdep_routines += \
>>    memcpy_noalignment \
>>    # sysdep_routines
>>
>> +ifeq ($(have-gcc-riscv-rvv),yes)
>> +sysdep_routines += \
>> +  memcpy_vector \
>> +  # rvv sysdep_routines
>> +endif
>> +
>>  CFLAGS-memcpy_noalignment.c += -mno-strict-align
>>  endif
>> diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c b/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
>> index 1c1deca8f6..26f3376d23 100644
>> --- a/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
>> +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
>> @@ -19,6 +19,8 @@
>>  #include <ifunc-impl-list.h>
>>  #include <string.h>
>>  #include <sys/hwprobe.h>
>> +#include <ldsodefs.h>
>> +#include <asm/hwcap.h>
>>
>>  size_t
>>  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>> @@ -27,6 +29,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>    size_t i = max;
>>
>>    bool fast_unaligned = false;
>> +#if defined(HAVE_RISCV_ASM_VECTOR_SUPPORT)
>> +  bool rvv_ext = false;
>> +#endif
>>
>>    struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_CPUPERF_0 };
>>    if (__riscv_hwprobe (&pair, 1, 0, NULL, 0) == 0
>> @@ -34,7 +39,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>            == RISCV_HWPROBE_MISALIGNED_FAST)
>>      fast_unaligned = true;
>>
>> +#if defined(HAVE_RISCV_ASM_VECTOR_SUPPORT)
>> +  if (GLRO(dl_hwcap) & COMPAT_HWCAP_ISA_V)
>> +    rvv_ext = true;
>> +#endif
>> +
>>    IFUNC_IMPL (i, name, memcpy,
>> +#if defined(HAVE_RISCV_ASM_VECTOR_SUPPORT)
>> +             IFUNC_IMPL_ADD (array, i, memcpy, rvv_ext,
>> +                             __memcpy_vector)
>> +#endif
>>               IFUNC_IMPL_ADD (array, i, memcpy, fast_unaligned,
>>                               __memcpy_noalignment)
>>               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
>> diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c b/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
>> index 8544f5402a..4bedd21866 100644
>> --- a/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
>> +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
>> @@ -27,16 +27,24 @@
>>  # include <ifunc-init.h>
>>  # include <riscv-ifunc.h>
>>  # include <sys/hwprobe.h>
>> +# include <asm/hwcap.h>
>>
>>  extern __typeof (__redirect_memcpy) __libc_memcpy;
>>
>>  extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
>>  extern __typeof (__redirect_memcpy) __memcpy_noalignment attribute_hidden;
>> +extern __typeof (__redirect_memcpy) __memcpy_vector attribute_hidden;
>>
>>  static inline __typeof (__redirect_memcpy) *
>>  select_memcpy_ifunc (uint64_t dl_hwcap, __riscv_hwprobe_t hwprobe_func)
>>  {
>>    unsigned long long int v;
>> +
>> +#if defined(HAVE_RISCV_ASM_VECTOR_SUPPORT)
>> +  if (dl_hwcap & COMPAT_HWCAP_ISA_V)
>> +    return __memcpy_vector;
>> +#endif
>> +
>>    if (__riscv_hwprobe_one (hwprobe_func, RISCV_HWPROBE_KEY_CPUPERF_0, &v) == 0
>>        && (v & RISCV_HWPROBE_MISALIGNED_MASK) == RISCV_HWPROBE_MISALIGNED_FAST)
>>      return __memcpy_noalignment;
>> --
>> 2.25.1
>>
>>
  
Palmer Dabbelt Aug. 16, 2025, 4:55 p.m. UTC | #4
On Fri, 15 Aug 2025 21:41:13 PDT (-0700), Vineet Gupta wrote:
>
>
> On 2/22/25 01:11, Anton Blanchard wrote:
>> Hi,
>>
>> On Fri, Feb 21, 2025 at 8:57 PM <daichengrong@iscas.ac.cn> wrote:
>>> From: daichengrong <daichengrong@iscas.ac.cn>
>>>
>>> Change in v7:
>>>    update dl_hwcap support using kernel header
>>>    update rvv memcpy compile with option,+v
>>>    delete optimization for small lengths
>> I tested this again on our upcoming Tenstorrent Ascalon CPU and (as
>> before) it's a 3x speedup on long lengths, a clear improvement for
>> everything 16 bytes onwards and no significant regressions on lengths
>> less than 16 bytes.
>>
>> Tested-by: Anton Blanchard <antonb@tenstorrent.com>
>
> Hi Palmer,
>
> Is this patch series blocked by something ?

Jeff said he had a better version of this and was going to send it.

>
> Thx,
> -Vineet
>
>>
>> Anton
>>
>>> Change in v6:
>>>    Optimize the RVV memcpy for small lengths less than VLEN/8 bytes
>>>
>>> Changes in v5:
>>>    check ifunc-impl-list memcpy vector support with by dl_hwcap
>>>
>>> Changes in v4:
>>>    update rvv memcpy support by compiler
>>>    check whether rvv enabled by dl_hwcap
>>>
>>> Changes in v2:
>>>    delete size-0 branch
>>> ---
>>>  sysdeps/riscv/multiarch/memcpy_vector.S       | 37 +++++++++++++++++++
>>>  .../unix/sysv/linux/riscv/multiarch/Makefile  |  6 +++
>>>  .../linux/riscv/multiarch/ifunc-impl-list.c   | 14 +++++++
>>>  .../unix/sysv/linux/riscv/multiarch/memcpy.c  |  8 ++++
>>>  4 files changed, 65 insertions(+)
>>>  create mode 100644 sysdeps/riscv/multiarch/memcpy_vector.S
>>>
>>> diff --git a/sysdeps/riscv/multiarch/memcpy_vector.S b/sysdeps/riscv/multiarch/memcpy_vector.S
>>> new file mode 100644
>>> index 0000000000..eaf28aaf6d
>>> --- /dev/null
>>> +++ b/sysdeps/riscv/multiarch/memcpy_vector.S
>>> @@ -0,0 +1,37 @@
>>> +/* memcpy for RISC-V Vector.
>>> +   Copyright (C) 2024-2025 Free Software Foundation, Inc.
>>> +   This file is part of the GNU C Library.
>>> +
>>> +   The GNU C Library is free software; you can redistribute it and/or
>>> +   modify it under the terms of the GNU Lesser General Public
>>> +   License as published by the Free Software Foundation; either
>>> +   version 2.1 of the License, or (at your option) any later version.
>>> +
>>> +   The GNU C Library is distributed in the hope that it will be useful,
>>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> +   Lesser General Public License for more details.
>>> +
>>> +   You should have received a copy of the GNU Lesser General Public
>>> +   License along with the GNU C Library; if not, see
>>> +   <https://www.gnu.org/licenses/>.  */
>>> +
>>> +
>>> +#include <sysdep.h>
>>> +#include <sys/asm.h>
>>> +
>>> +ENTRY (__memcpy_vector)
>>> +.option push
>>> +.option arch, +v
>>> +    mv     a6, a0
>>> +L(loop):
>>> +    vsetvli a3,a2,e8,m8,ta,ma
>>> +    vle8.v  v8,(a1)
>>> +    vse8.v  v8,(a6)
>>> +    add     a1,a1,a3
>>> +    sub     a2,a2,a3
>>> +    add     a6,a6,a3
>>> +    bnez    a2,L(loop)
>>> +    ret
>>> +.option pop
>>> +END (__memcpy_vector)
>>> diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile b/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
>>> index fcef5659d4..478338006b 100644
>>> --- a/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
>>> +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
>>> @@ -5,5 +5,11 @@ sysdep_routines += \
>>>    memcpy_noalignment \
>>>    # sysdep_routines
>>>
>>> +ifeq ($(have-gcc-riscv-rvv),yes)
>>> +sysdep_routines += \
>>> +  memcpy_vector \
>>> +  # rvv sysdep_routines
>>> +endif
>>> +
>>>  CFLAGS-memcpy_noalignment.c += -mno-strict-align
>>>  endif
>>> diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c b/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
>>> index 1c1deca8f6..26f3376d23 100644
>>> --- a/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
>>> +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
>>> @@ -19,6 +19,8 @@
>>>  #include <ifunc-impl-list.h>
>>>  #include <string.h>
>>>  #include <sys/hwprobe.h>
>>> +#include <ldsodefs.h>
>>> +#include <asm/hwcap.h>
>>>
>>>  size_t
>>>  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>> @@ -27,6 +29,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>>    size_t i = max;
>>>
>>>    bool fast_unaligned = false;
>>> +#if defined(HAVE_RISCV_ASM_VECTOR_SUPPORT)
>>> +  bool rvv_ext = false;
>>> +#endif
>>>
>>>    struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_CPUPERF_0 };
>>>    if (__riscv_hwprobe (&pair, 1, 0, NULL, 0) == 0
>>> @@ -34,7 +39,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>>            == RISCV_HWPROBE_MISALIGNED_FAST)
>>>      fast_unaligned = true;
>>>
>>> +#if defined(HAVE_RISCV_ASM_VECTOR_SUPPORT)
>>> +  if (GLRO(dl_hwcap) & COMPAT_HWCAP_ISA_V)
>>> +    rvv_ext = true;
>>> +#endif
>>> +
>>>    IFUNC_IMPL (i, name, memcpy,
>>> +#if defined(HAVE_RISCV_ASM_VECTOR_SUPPORT)
>>> +             IFUNC_IMPL_ADD (array, i, memcpy, rvv_ext,
>>> +                             __memcpy_vector)
>>> +#endif
>>>               IFUNC_IMPL_ADD (array, i, memcpy, fast_unaligned,
>>>                               __memcpy_noalignment)
>>>               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
>>> diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c b/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
>>> index 8544f5402a..4bedd21866 100644
>>> --- a/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
>>> +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
>>> @@ -27,16 +27,24 @@
>>>  # include <ifunc-init.h>
>>>  # include <riscv-ifunc.h>
>>>  # include <sys/hwprobe.h>
>>> +# include <asm/hwcap.h>
>>>
>>>  extern __typeof (__redirect_memcpy) __libc_memcpy;
>>>
>>>  extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
>>>  extern __typeof (__redirect_memcpy) __memcpy_noalignment attribute_hidden;
>>> +extern __typeof (__redirect_memcpy) __memcpy_vector attribute_hidden;
>>>
>>>  static inline __typeof (__redirect_memcpy) *
>>>  select_memcpy_ifunc (uint64_t dl_hwcap, __riscv_hwprobe_t hwprobe_func)
>>>  {
>>>    unsigned long long int v;
>>> +
>>> +#if defined(HAVE_RISCV_ASM_VECTOR_SUPPORT)
>>> +  if (dl_hwcap & COMPAT_HWCAP_ISA_V)
>>> +    return __memcpy_vector;
>>> +#endif
>>> +
>>>    if (__riscv_hwprobe_one (hwprobe_func, RISCV_HWPROBE_KEY_CPUPERF_0, &v) == 0
>>>        && (v & RISCV_HWPROBE_MISALIGNED_MASK) == RISCV_HWPROBE_MISALIGNED_FAST)
>>>      return __memcpy_noalignment;
>>> --
>>> 2.25.1
>>>
>>>
  

Patch

diff --git a/sysdeps/riscv/multiarch/memcpy_vector.S b/sysdeps/riscv/multiarch/memcpy_vector.S
new file mode 100644
index 0000000000..eaf28aaf6d
--- /dev/null
+++ b/sysdeps/riscv/multiarch/memcpy_vector.S
@@ -0,0 +1,37 @@ 
+/* memcpy for RISC-V Vector.
+   Copyright (C) 2024-2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+ENTRY (__memcpy_vector) 
+.option push
+.option arch, +v
+    mv	    a6, a0
+L(loop):
+    vsetvli a3,a2,e8,m8,ta,ma
+    vle8.v  v8,(a1)
+    vse8.v  v8,(a6)
+    add     a1,a1,a3
+    sub     a2,a2,a3 
+    add     a6,a6,a3 
+    bnez    a2,L(loop)  
+    ret
+.option pop
+END (__memcpy_vector)
diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile b/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
index fcef5659d4..478338006b 100644
--- a/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
+++ b/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
@@ -5,5 +5,11 @@  sysdep_routines += \
   memcpy_noalignment \
   # sysdep_routines
 
+ifeq ($(have-gcc-riscv-rvv),yes)
+sysdep_routines += \
+  memcpy_vector \
+  # rvv sysdep_routines
+endif
+
 CFLAGS-memcpy_noalignment.c += -mno-strict-align
 endif
diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c b/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
index 1c1deca8f6..26f3376d23 100644
--- a/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
+++ b/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
@@ -19,6 +19,8 @@ 
 #include <ifunc-impl-list.h>
 #include <string.h>
 #include <sys/hwprobe.h>
+#include <ldsodefs.h>
+#include <asm/hwcap.h>
 
 size_t
 __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
@@ -27,6 +29,9 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   size_t i = max;
 
   bool fast_unaligned = false;
+#if defined(HAVE_RISCV_ASM_VECTOR_SUPPORT) 
+  bool rvv_ext = false;
+#endif
 
   struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_CPUPERF_0 };
   if (__riscv_hwprobe (&pair, 1, 0, NULL, 0) == 0
@@ -34,7 +39,16 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
           == RISCV_HWPROBE_MISALIGNED_FAST)
     fast_unaligned = true;
 
+#if defined(HAVE_RISCV_ASM_VECTOR_SUPPORT) 
+  if (GLRO(dl_hwcap) & COMPAT_HWCAP_ISA_V) 
+    rvv_ext = true;
+#endif
+
   IFUNC_IMPL (i, name, memcpy,
+#if defined(HAVE_RISCV_ASM_VECTOR_SUPPORT) 
+	      IFUNC_IMPL_ADD (array, i, memcpy, rvv_ext,
+			      __memcpy_vector)
+#endif
 	      IFUNC_IMPL_ADD (array, i, memcpy, fast_unaligned,
 			      __memcpy_noalignment)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c b/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
index 8544f5402a..4bedd21866 100644
--- a/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
+++ b/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
@@ -27,16 +27,24 @@ 
 # include <ifunc-init.h>
 # include <riscv-ifunc.h>
 # include <sys/hwprobe.h>
+# include <asm/hwcap.h>
 
 extern __typeof (__redirect_memcpy) __libc_memcpy;
 
 extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_noalignment attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_vector attribute_hidden;
 
 static inline __typeof (__redirect_memcpy) *
 select_memcpy_ifunc (uint64_t dl_hwcap, __riscv_hwprobe_t hwprobe_func)
 {
   unsigned long long int v;
+
+#if defined(HAVE_RISCV_ASM_VECTOR_SUPPORT) 
+  if (dl_hwcap & COMPAT_HWCAP_ISA_V) 
+    return __memcpy_vector;
+#endif
+
   if (__riscv_hwprobe_one (hwprobe_func, RISCV_HWPROBE_KEY_CPUPERF_0, &v) == 0
       && (v & RISCV_HWPROBE_MISALIGNED_MASK) == RISCV_HWPROBE_MISALIGNED_FAST)
     return __memcpy_noalignment;