[1/6] riscv: add vectorized memccpy

Message ID 20260513153835.213249-2-pincheng.plct@isrc.iscas.ac.cn
State New
Headers
Series riscv: add vectorized mem* routines |

Commit Message

Pincheng Wang May 13, 2026, 3:38 p.m. UTC
  The vector implementation uses m8 register grouping and processes data in
vector-length chunks, providing significant performance improvements on
RVV-capable hardware.  Use conditional compilation to fall back to the
generic implementation when __riscv_vector is not available, maintaining
compatibility with non-vector RISC-V systems.

Signed-off-by: Pincheng Wang <pincheng.plct@isrc.iscas.ac.cn>
---
 newlib/libc/machine/riscv/Makefile.inc  | 18 ++++++++++--
 newlib/libc/machine/riscv/memccpy-asm.S | 39 +++++++++++++++++++++++++
 newlib/libc/machine/riscv/memccpy.c     |  5 ++++
 3 files changed, 60 insertions(+), 2 deletions(-)
 create mode 100644 newlib/libc/machine/riscv/memccpy-asm.S
 create mode 100644 newlib/libc/machine/riscv/memccpy.c
  

Comments

Kito Cheng May 22, 2026, 8:36 a.m. UTC | #1
Pincheng Wang <pincheng.plct@isrc.iscas.ac.cn> 於 2026年5月13日週三 下午11:39寫道:
>
> The vector implementation uses m8 register grouping and processes data in
> vector-length chunks, providing significant performance improvements on
> RVV-capable hardware.  Use conditional compilation to fall back to the
> generic implementation when __riscv_vector is not available, maintaining
> compatibility with non-vector RISC-V systems.
>
> Signed-off-by: Pincheng Wang <pincheng.plct@isrc.iscas.ac.cn>
> ---
>  newlib/libc/machine/riscv/Makefile.inc  | 18 ++++++++++--
>  newlib/libc/machine/riscv/memccpy-asm.S | 39 +++++++++++++++++++++++++
>  newlib/libc/machine/riscv/memccpy.c     |  5 ++++
>  3 files changed, 60 insertions(+), 2 deletions(-)
>  create mode 100644 newlib/libc/machine/riscv/memccpy-asm.S
>  create mode 100644 newlib/libc/machine/riscv/memccpy.c
>
> diff --git a/newlib/libc/machine/riscv/Makefile.inc b/newlib/libc/machine/riscv/Makefile.inc
> index 3cc6e198f..676608aa5 100644
> --- a/newlib/libc/machine/riscv/Makefile.inc
> +++ b/newlib/libc/machine/riscv/Makefile.inc
> @@ -1,3 +1,17 @@
>  libc_a_SOURCES += \
> -       %D%/memmove-asm.S %D%/memmove.c %D%/memset.S %D%/memcpy-asm.S %D%/memcpy.c %D%/strlen.c \
> -       %D%/strcpy.c %D%/stpcpy.c %D%/strcmp.S %D%/memchr.c %D%/memrchr.c %D%/setjmp.S %D%/ieeefp.c %D%/ffs.c
> +        %D%/ffs.c \
> +        %D%/ieeefp.c \
> +        %D%/memccpy-asm.S \
> +        %D%/memccpy.c \
> +        %D%/memchr.c \
> +        %D%/memcpy-asm.S \
> +        %D%/memcpy.c \
> +        %D%/memmove-asm.S \
> +        %D%/memmove.c \
> +        %D%/memrchr.c \
> +        %D%/memset.S \
> +        %D%/setjmp.S \
> +        %D%/stpcpy.c \
> +        %D%/strcmp.S \
> +        %D%/strcpy.c \
> +        %D%/strlen.c
> diff --git a/newlib/libc/machine/riscv/memccpy-asm.S b/newlib/libc/machine/riscv/memccpy-asm.S
> new file mode 100644
> index 000000000..3c33a8ae0
> --- /dev/null
> +++ b/newlib/libc/machine/riscv/memccpy-asm.S
> @@ -0,0 +1,39 @@
> +#if defined(__riscv_vector) && !defined(__OPTIMIZE_SIZE__) && !defined(PREFER_SIZE_OVER_SPEED)
> +.text
> +.option push
> +.option arch, +zve32x

^^^ BTW we don't really option push option arch here since we already
guarded with __riscv_vector, that already guarantee we will have
zve32x anyway.

> +.global memccpy
> +.type memccpy, @function
> +memccpy:
> +#if __riscv_landing_pad
> +  lpad 0
> +#endif
> +  beqz a3, .Lnot_found
> +  andi a2, a2, 0xff
> +  mv a5, a0
> +.Lloop:
> +  vsetvli zero, a3, e8, m8, ta, ma
> +  vle8ff.v v0, (a1)

For all vle8ff, I would like to prevent it to use m8 as possible since
it did really read vlmax elements if does not hit any fault, and that
could lead very poor performance with small input on large VLEN
machine.

So here is two possible direction here:
1) Just use m1
2) Use complicate way to increase the LMUL each time, e.g. start from
mf2, then switch to m1, then m2, then m4...until m8.
  
Kito Cheng May 22, 2026, 8:50 a.m. UTC | #2
Kito Cheng <kito.cheng@gmail.com> 於 2026年5月22日週五 下午4:36寫道:
>
> Pincheng Wang <pincheng.plct@isrc.iscas.ac.cn> 於 2026年5月13日週三 下午11:39寫道:
> >
> > The vector implementation uses m8 register grouping and processes data in
> > vector-length chunks, providing significant performance improvements on
> > RVV-capable hardware.  Use conditional compilation to fall back to the
> > generic implementation when __riscv_vector is not available, maintaining
> > compatibility with non-vector RISC-V systems.
> >
> > Signed-off-by: Pincheng Wang <pincheng.plct@isrc.iscas.ac.cn>
> > ---
> >  newlib/libc/machine/riscv/Makefile.inc  | 18 ++++++++++--
> >  newlib/libc/machine/riscv/memccpy-asm.S | 39 +++++++++++++++++++++++++
> >  newlib/libc/machine/riscv/memccpy.c     |  5 ++++
> >  3 files changed, 60 insertions(+), 2 deletions(-)
> >  create mode 100644 newlib/libc/machine/riscv/memccpy-asm.S
> >  create mode 100644 newlib/libc/machine/riscv/memccpy.c
> >
> > diff --git a/newlib/libc/machine/riscv/Makefile.inc b/newlib/libc/machine/riscv/Makefile.inc
> > index 3cc6e198f..676608aa5 100644
> > --- a/newlib/libc/machine/riscv/Makefile.inc
> > +++ b/newlib/libc/machine/riscv/Makefile.inc
> > @@ -1,3 +1,17 @@
> >  libc_a_SOURCES += \
> > -       %D%/memmove-asm.S %D%/memmove.c %D%/memset.S %D%/memcpy-asm.S %D%/memcpy.c %D%/strlen.c \
> > -       %D%/strcpy.c %D%/stpcpy.c %D%/strcmp.S %D%/memchr.c %D%/memrchr.c %D%/setjmp.S %D%/ieeefp.c %D%/ffs.c
> > +        %D%/ffs.c \
> > +        %D%/ieeefp.c \
> > +        %D%/memccpy-asm.S \
> > +        %D%/memccpy.c \
> > +        %D%/memchr.c \
> > +        %D%/memcpy-asm.S \
> > +        %D%/memcpy.c \
> > +        %D%/memmove-asm.S \
> > +        %D%/memmove.c \
> > +        %D%/memrchr.c \
> > +        %D%/memset.S \
> > +        %D%/setjmp.S \
> > +        %D%/stpcpy.c \
> > +        %D%/strcmp.S \
> > +        %D%/strcpy.c \
> > +        %D%/strlen.c
> > diff --git a/newlib/libc/machine/riscv/memccpy-asm.S b/newlib/libc/machine/riscv/memccpy-asm.S
> > new file mode 100644
> > index 000000000..3c33a8ae0
> > --- /dev/null
> > +++ b/newlib/libc/machine/riscv/memccpy-asm.S
> > @@ -0,0 +1,39 @@
> > +#if defined(__riscv_vector) && !defined(__OPTIMIZE_SIZE__) && !defined(PREFER_SIZE_OVER_SPEED)
> > +.text
> > +.option push
> > +.option arch, +zve32x
>
> ^^^ BTW we don't really option push option arch here since we already
> guarded with __riscv_vector, that already guarantee we will have
> zve32x anyway.
>
> > +.global memccpy
> > +.type memccpy, @function
> > +memccpy:
> > +#if __riscv_landing_pad
> > +  lpad 0
> > +#endif
> > +  beqz a3, .Lnot_found
> > +  andi a2, a2, 0xff
> > +  mv a5, a0
> > +.Lloop:
> > +  vsetvli zero, a3, e8, m8, ta, ma
> > +  vle8ff.v v0, (a1)
>
> For all vle8ff, I would like to prevent it to use m8 as possible since
> it did really read vlmax elements if does not hit any fault, and that
> could lead very poor performance with small input on large VLEN
> machine.
>
> So here is two possible direction here:
> 1) Just use m1
> 2) Use complicate way to increase the LMUL each time, e.g. start from
> mf2, then switch to m1, then m2, then m4...until m8.

Wait, I realized it did have specify VL unlike strcmp, so withdraw
this comment :)
  

Patch

diff --git a/newlib/libc/machine/riscv/Makefile.inc b/newlib/libc/machine/riscv/Makefile.inc
index 3cc6e198f..676608aa5 100644
--- a/newlib/libc/machine/riscv/Makefile.inc
+++ b/newlib/libc/machine/riscv/Makefile.inc
@@ -1,3 +1,17 @@ 
 libc_a_SOURCES += \
-	%D%/memmove-asm.S %D%/memmove.c %D%/memset.S %D%/memcpy-asm.S %D%/memcpy.c %D%/strlen.c \
-	%D%/strcpy.c %D%/stpcpy.c %D%/strcmp.S %D%/memchr.c %D%/memrchr.c %D%/setjmp.S %D%/ieeefp.c %D%/ffs.c
+        %D%/ffs.c \
+        %D%/ieeefp.c \
+        %D%/memccpy-asm.S \
+        %D%/memccpy.c \
+        %D%/memchr.c \
+        %D%/memcpy-asm.S \
+        %D%/memcpy.c \
+        %D%/memmove-asm.S \
+        %D%/memmove.c \
+        %D%/memrchr.c \
+        %D%/memset.S \
+        %D%/setjmp.S \
+        %D%/stpcpy.c \
+        %D%/strcmp.S \
+        %D%/strcpy.c \
+        %D%/strlen.c
diff --git a/newlib/libc/machine/riscv/memccpy-asm.S b/newlib/libc/machine/riscv/memccpy-asm.S
new file mode 100644
index 000000000..3c33a8ae0
--- /dev/null
+++ b/newlib/libc/machine/riscv/memccpy-asm.S
@@ -0,0 +1,39 @@ 
+#if defined(__riscv_vector) && !defined(__OPTIMIZE_SIZE__) && !defined(PREFER_SIZE_OVER_SPEED)
+.text
+.option push
+.option arch, +zve32x
+.global memccpy
+.type memccpy, @function
+memccpy:
+#if __riscv_landing_pad
+  lpad 0
+#endif
+  beqz a3, .Lnot_found
+  andi a2, a2, 0xff
+  mv a5, a0
+.Lloop:
+  vsetvli zero, a3, e8, m8, ta, ma
+  vle8ff.v v0, (a1)
+
+  vmseq.vx v8, v0, a2
+  vfirst.m a6, v8
+  csrr a4, vl
+  bgez a6, .Lfound
+  vse8.v v0, (a5)
+
+  sub a3, a3, a4
+  add a1, a1, a4
+  add a5, a5, a4
+  bnez a3, .Lloop
+.Lnot_found:
+  mv a0, zero
+  ret
+.Lfound:
+  addi a6, a6, 1
+  vsetvli zero, a6, e8, m8, ta, ma
+  vse8.v v0, (a5)
+  add a0, a5, a6
+  ret
+.size memccpy, .-memccpy
+.option pop
+#endif
diff --git a/newlib/libc/machine/riscv/memccpy.c b/newlib/libc/machine/riscv/memccpy.c
new file mode 100644
index 000000000..7ca779d48
--- /dev/null
+++ b/newlib/libc/machine/riscv/memccpy.c
@@ -0,0 +1,5 @@ 
+#if defined(__OPTIMIZE_SIZE__) || defined(PREFER_SIZE_OVER_SPEED) || !defined(__riscv_vector)
+# include "../../string/memccpy.c"
+#else
+/* memccpy defined in memccpy-asm.S */
+#endif