[3/6] riscv: add vectorized memcmp

Message ID 20260513153835.213249-4-pincheng.plct@isrc.iscas.ac.cn
State New
Headers
Series riscv: add vectorized mem* routines |

Commit Message

Pincheng Wang May 13, 2026, 3:38 p.m. UTC
  The vector implementation uses m8 register grouping and processes data in
vector-length chunks, providing significant performance improvements on
RVV-capable hardware.  Use conditional compilation to fall back to the
generic implementation when __riscv_vector is not available, maintaining
compatibility with non-vector RISC-V systems.

Signed-off-by: Pincheng Wang <pincheng.plct@isrc.iscas.ac.cn>
---
 newlib/libc/machine/riscv/Makefile.inc |  2 ++
 newlib/libc/machine/riscv/memcmp-asm.S | 43 ++++++++++++++++++++++++++
 newlib/libc/machine/riscv/memcmp.c     |  5 +++
 3 files changed, 50 insertions(+)
 create mode 100644 newlib/libc/machine/riscv/memcmp-asm.S
 create mode 100644 newlib/libc/machine/riscv/memcmp.c
  

Comments

Kito Cheng May 22, 2026, 8:31 a.m. UTC | #1
Hi Pincheng:

> diff --git a/newlib/libc/machine/riscv/memcmp-asm.S b/newlib/libc/machine/riscv/memcmp-asm.S
> new file mode 100644
> index 000000000..8614f66e4
> --- /dev/null
> +++ b/newlib/libc/machine/riscv/memcmp-asm.S
> @@ -0,0 +1,43 @@
> +#if defined(__riscv_vector) && !defined(__OPTIMIZE_SIZE__) && !defined(PREFER_SIZE_OVER_SPEED)
> +.text
> +.option push
> +.option arch, +zve32x
> +.global memcmp
> +.type memcmp, @function
> +memcmp:
> +#if __riscv_landing_pad
> +  lpad 0
> +#endif
> +  beqz a2, .Lequal
> +.Lloop:
> +  vsetvli a3, a2, e8, m8, ta, ma
> +
> +  vle8.v v0, (a0)
> +  vle8.v v8, (a1)
> +
> +  vmsne.vv v16, v0, v8
> +  sub a2, a2, a3
> +  vfirst.m a4, v16
> +
> +  bgez a4, .Lfound
> +
> +  add a0, a0, a3
> +  add a1, a1, a3
> +
> +  bnez a2, .Lloop
> +
> +.Lequal:
> +  li a0, 0
> +  ret
> +.Lfound:
> +  vrgather.vx v16, v0, a4
> +  vrgather.vx v24, v8, a4

I would like to avoid those two vrgather here, we can use the result
of vfirst.m and did some arithmetic to get same value as well,
vrgather is a powerful instruction but it might slow.

> +  vmv.x.s a0, v16
> +  vmv.x.s a4, v24

Also that could prevent those two vec reg -> GPR move, that might be
relative expensive in some uarch.

> +  andi a0, a0, 0xff
> +  andi a4, a4, 0xff
> +  sub a0, a0, a4
> +  ret
> +.size memcmp, .-memcmp
> +.option pop
> +#endif
  

Patch

diff --git a/newlib/libc/machine/riscv/Makefile.inc b/newlib/libc/machine/riscv/Makefile.inc
index fe5f07982..fbf87b3db 100644
--- a/newlib/libc/machine/riscv/Makefile.inc
+++ b/newlib/libc/machine/riscv/Makefile.inc
@@ -5,6 +5,8 @@  libc_a_SOURCES += \
         %D%/memccpy.c \
         %D%/memchr-asm.S \
         %D%/memchr.c \
+        %D%/memcmp-asm.S \
+        %D%/memcmp.c \
         %D%/memcpy-asm.S \
         %D%/memcpy.c \
         %D%/memmove-asm.S \
diff --git a/newlib/libc/machine/riscv/memcmp-asm.S b/newlib/libc/machine/riscv/memcmp-asm.S
new file mode 100644
index 000000000..8614f66e4
--- /dev/null
+++ b/newlib/libc/machine/riscv/memcmp-asm.S
@@ -0,0 +1,43 @@ 
+#if defined(__riscv_vector) && !defined(__OPTIMIZE_SIZE__) && !defined(PREFER_SIZE_OVER_SPEED)
+.text
+.option push
+.option arch, +zve32x
+.global memcmp
+.type memcmp, @function
+memcmp:
+#if __riscv_landing_pad
+  lpad 0
+#endif
+  beqz a2, .Lequal
+.Lloop:
+  vsetvli a3, a2, e8, m8, ta, ma
+
+  vle8.v v0, (a0)
+  vle8.v v8, (a1)
+
+  vmsne.vv v16, v0, v8
+  sub a2, a2, a3
+  vfirst.m a4, v16
+
+  bgez a4, .Lfound
+
+  add a0, a0, a3
+  add a1, a1, a3
+
+  bnez a2, .Lloop
+
+.Lequal:
+  li a0, 0
+  ret
+.Lfound:
+  vrgather.vx v16, v0, a4
+  vrgather.vx v24, v8, a4
+  vmv.x.s a0, v16
+  vmv.x.s a4, v24
+  andi a0, a0, 0xff
+  andi a4, a4, 0xff
+  sub a0, a0, a4
+  ret
+.size memcmp, .-memcmp
+.option pop
+#endif
diff --git a/newlib/libc/machine/riscv/memcmp.c b/newlib/libc/machine/riscv/memcmp.c
new file mode 100644
index 000000000..3e5613213
--- /dev/null
+++ b/newlib/libc/machine/riscv/memcmp.c
@@ -0,0 +1,5 @@ 
+#if defined(__OPTIMIZE_SIZE__) || defined(PREFER_SIZE_OVER_SPEED) || !defined(__riscv_vector)
+# include "../../string/memcmp.c"
+#else
+/* memcmp defined in memcmp-asm.S */
+#endif