[1/6] riscv: add vectorized memccpy
Commit Message
The vector implementation uses m8 register grouping and processes data in
vector-length chunks, providing significant performance improvements on
RVV-capable hardware. Use conditional compilation to fall back to the
generic implementation when __riscv_vector is not available, maintaining
compatibility with non-vector RISC-V systems.
Signed-off-by: Pincheng Wang <pincheng.plct@isrc.iscas.ac.cn>
---
newlib/libc/machine/riscv/Makefile.inc | 18 ++++++++++--
newlib/libc/machine/riscv/memccpy-asm.S | 39 +++++++++++++++++++++++++
newlib/libc/machine/riscv/memccpy.c | 5 ++++
3 files changed, 60 insertions(+), 2 deletions(-)
create mode 100644 newlib/libc/machine/riscv/memccpy-asm.S
create mode 100644 newlib/libc/machine/riscv/memccpy.c
Comments
Pincheng Wang <pincheng.plct@isrc.iscas.ac.cn> 於 2026年5月13日週三 下午11:39寫道:
>
> The vector implementation uses m8 register grouping and processes data in
> vector-length chunks, providing significant performance improvements on
> RVV-capable hardware. Use conditional compilation to fall back to the
> generic implementation when __riscv_vector is not available, maintaining
> compatibility with non-vector RISC-V systems.
>
> Signed-off-by: Pincheng Wang <pincheng.plct@isrc.iscas.ac.cn>
> ---
> newlib/libc/machine/riscv/Makefile.inc | 18 ++++++++++--
> newlib/libc/machine/riscv/memccpy-asm.S | 39 +++++++++++++++++++++++++
> newlib/libc/machine/riscv/memccpy.c | 5 ++++
> 3 files changed, 60 insertions(+), 2 deletions(-)
> create mode 100644 newlib/libc/machine/riscv/memccpy-asm.S
> create mode 100644 newlib/libc/machine/riscv/memccpy.c
>
> diff --git a/newlib/libc/machine/riscv/Makefile.inc b/newlib/libc/machine/riscv/Makefile.inc
> index 3cc6e198f..676608aa5 100644
> --- a/newlib/libc/machine/riscv/Makefile.inc
> +++ b/newlib/libc/machine/riscv/Makefile.inc
> @@ -1,3 +1,17 @@
> libc_a_SOURCES += \
> - %D%/memmove-asm.S %D%/memmove.c %D%/memset.S %D%/memcpy-asm.S %D%/memcpy.c %D%/strlen.c \
> - %D%/strcpy.c %D%/stpcpy.c %D%/strcmp.S %D%/memchr.c %D%/memrchr.c %D%/setjmp.S %D%/ieeefp.c %D%/ffs.c
> + %D%/ffs.c \
> + %D%/ieeefp.c \
> + %D%/memccpy-asm.S \
> + %D%/memccpy.c \
> + %D%/memchr.c \
> + %D%/memcpy-asm.S \
> + %D%/memcpy.c \
> + %D%/memmove-asm.S \
> + %D%/memmove.c \
> + %D%/memrchr.c \
> + %D%/memset.S \
> + %D%/setjmp.S \
> + %D%/stpcpy.c \
> + %D%/strcmp.S \
> + %D%/strcpy.c \
> + %D%/strlen.c
> diff --git a/newlib/libc/machine/riscv/memccpy-asm.S b/newlib/libc/machine/riscv/memccpy-asm.S
> new file mode 100644
> index 000000000..3c33a8ae0
> --- /dev/null
> +++ b/newlib/libc/machine/riscv/memccpy-asm.S
> @@ -0,0 +1,39 @@
> +#if defined(__riscv_vector) && !defined(__OPTIMIZE_SIZE__) && !defined(PREFER_SIZE_OVER_SPEED)
> +.text
> +.option push
> +.option arch, +zve32x
^^^ BTW we don't really option push option arch here since we already
guarded with __riscv_vector, that already guarantee we will have
zve32x anyway.
> +.global memccpy
> +.type memccpy, @function
> +memccpy:
> +#if __riscv_landing_pad
> + lpad 0
> +#endif
> + beqz a3, .Lnot_found
> + andi a2, a2, 0xff
> + mv a5, a0
> +.Lloop:
> + vsetvli zero, a3, e8, m8, ta, ma
> + vle8ff.v v0, (a1)
For all vle8ff, I would like to prevent it to use m8 as possible since
it did really read vlmax elements if does not hit any fault, and that
could lead very poor performance with small input on large VLEN
machine.
So here is two possible direction here:
1) Just use m1
2) Use complicate way to increase the LMUL each time, e.g. start from
mf2, then switch to m1, then m2, then m4...until m8.
Kito Cheng <kito.cheng@gmail.com> 於 2026年5月22日週五 下午4:36寫道:
>
> Pincheng Wang <pincheng.plct@isrc.iscas.ac.cn> 於 2026年5月13日週三 下午11:39寫道:
> >
> > The vector implementation uses m8 register grouping and processes data in
> > vector-length chunks, providing significant performance improvements on
> > RVV-capable hardware. Use conditional compilation to fall back to the
> > generic implementation when __riscv_vector is not available, maintaining
> > compatibility with non-vector RISC-V systems.
> >
> > Signed-off-by: Pincheng Wang <pincheng.plct@isrc.iscas.ac.cn>
> > ---
> > newlib/libc/machine/riscv/Makefile.inc | 18 ++++++++++--
> > newlib/libc/machine/riscv/memccpy-asm.S | 39 +++++++++++++++++++++++++
> > newlib/libc/machine/riscv/memccpy.c | 5 ++++
> > 3 files changed, 60 insertions(+), 2 deletions(-)
> > create mode 100644 newlib/libc/machine/riscv/memccpy-asm.S
> > create mode 100644 newlib/libc/machine/riscv/memccpy.c
> >
> > diff --git a/newlib/libc/machine/riscv/Makefile.inc b/newlib/libc/machine/riscv/Makefile.inc
> > index 3cc6e198f..676608aa5 100644
> > --- a/newlib/libc/machine/riscv/Makefile.inc
> > +++ b/newlib/libc/machine/riscv/Makefile.inc
> > @@ -1,3 +1,17 @@
> > libc_a_SOURCES += \
> > - %D%/memmove-asm.S %D%/memmove.c %D%/memset.S %D%/memcpy-asm.S %D%/memcpy.c %D%/strlen.c \
> > - %D%/strcpy.c %D%/stpcpy.c %D%/strcmp.S %D%/memchr.c %D%/memrchr.c %D%/setjmp.S %D%/ieeefp.c %D%/ffs.c
> > + %D%/ffs.c \
> > + %D%/ieeefp.c \
> > + %D%/memccpy-asm.S \
> > + %D%/memccpy.c \
> > + %D%/memchr.c \
> > + %D%/memcpy-asm.S \
> > + %D%/memcpy.c \
> > + %D%/memmove-asm.S \
> > + %D%/memmove.c \
> > + %D%/memrchr.c \
> > + %D%/memset.S \
> > + %D%/setjmp.S \
> > + %D%/stpcpy.c \
> > + %D%/strcmp.S \
> > + %D%/strcpy.c \
> > + %D%/strlen.c
> > diff --git a/newlib/libc/machine/riscv/memccpy-asm.S b/newlib/libc/machine/riscv/memccpy-asm.S
> > new file mode 100644
> > index 000000000..3c33a8ae0
> > --- /dev/null
> > +++ b/newlib/libc/machine/riscv/memccpy-asm.S
> > @@ -0,0 +1,39 @@
> > +#if defined(__riscv_vector) && !defined(__OPTIMIZE_SIZE__) && !defined(PREFER_SIZE_OVER_SPEED)
> > +.text
> > +.option push
> > +.option arch, +zve32x
>
> ^^^ BTW we don't really option push option arch here since we already
> guarded with __riscv_vector, that already guarantee we will have
> zve32x anyway.
>
> > +.global memccpy
> > +.type memccpy, @function
> > +memccpy:
> > +#if __riscv_landing_pad
> > + lpad 0
> > +#endif
> > + beqz a3, .Lnot_found
> > + andi a2, a2, 0xff
> > + mv a5, a0
> > +.Lloop:
> > + vsetvli zero, a3, e8, m8, ta, ma
> > + vle8ff.v v0, (a1)
>
> For all vle8ff, I would like to prevent it to use m8 as possible since
> it did really read vlmax elements if does not hit any fault, and that
> could lead very poor performance with small input on large VLEN
> machine.
>
> So here is two possible direction here:
> 1) Just use m1
> 2) Use complicate way to increase the LMUL each time, e.g. start from
> mf2, then switch to m1, then m2, then m4...until m8.
Wait, I realized it did have specify VL unlike strcmp, so withdraw
this comment :)
@@ -1,3 +1,17 @@
libc_a_SOURCES += \
- %D%/memmove-asm.S %D%/memmove.c %D%/memset.S %D%/memcpy-asm.S %D%/memcpy.c %D%/strlen.c \
- %D%/strcpy.c %D%/stpcpy.c %D%/strcmp.S %D%/memchr.c %D%/memrchr.c %D%/setjmp.S %D%/ieeefp.c %D%/ffs.c
+ %D%/ffs.c \
+ %D%/ieeefp.c \
+ %D%/memccpy-asm.S \
+ %D%/memccpy.c \
+ %D%/memchr.c \
+ %D%/memcpy-asm.S \
+ %D%/memcpy.c \
+ %D%/memmove-asm.S \
+ %D%/memmove.c \
+ %D%/memrchr.c \
+ %D%/memset.S \
+ %D%/setjmp.S \
+ %D%/stpcpy.c \
+ %D%/strcmp.S \
+ %D%/strcpy.c \
+ %D%/strlen.c
new file mode 100644
@@ -0,0 +1,39 @@
+#if defined(__riscv_vector) && !defined(__OPTIMIZE_SIZE__) && !defined(PREFER_SIZE_OVER_SPEED)
+.text
+.option push
+.option arch, +zve32x
+.global memccpy
+.type memccpy, @function
+memccpy:
+#if __riscv_landing_pad
+ lpad 0
+#endif
+ beqz a3, .Lnot_found
+ andi a2, a2, 0xff
+ mv a5, a0
+.Lloop:
+ vsetvli zero, a3, e8, m8, ta, ma
+ vle8ff.v v0, (a1)
+
+ vmseq.vx v8, v0, a2
+ vfirst.m a6, v8
+ csrr a4, vl
+ bgez a6, .Lfound
+ vse8.v v0, (a5)
+
+ sub a3, a3, a4
+ add a1, a1, a4
+ add a5, a5, a4
+ bnez a3, .Lloop
+.Lnot_found:
+ mv a0, zero
+ ret
+.Lfound:
+ addi a6, a6, 1
+ vsetvli zero, a6, e8, m8, ta, ma
+ vse8.v v0, (a5)
+ add a0, a5, a6
+ ret
+.size memccpy, .-memccpy
+.option pop
+#endif
new file mode 100644
@@ -0,0 +1,5 @@
+#if defined(__OPTIMIZE_SIZE__) || defined(PREFER_SIZE_OVER_SPEED) || !defined(__riscv_vector)
+# include "../../string/memccpy.c"
+#else
+/* memccpy defined in memccpy-asm.S */
+#endif