[3/6] riscv: add vectorized memcmp
Commit Message
The vector implementation uses m8 register grouping and processes data in
vector-length chunks, providing significant performance improvements on
RVV-capable hardware. Use conditional compilation to fall back to the
generic implementation when __riscv_vector is not available, maintaining
compatibility with non-vector RISC-V systems.
Signed-off-by: Pincheng Wang <pincheng.plct@isrc.iscas.ac.cn>
---
newlib/libc/machine/riscv/Makefile.inc | 2 ++
newlib/libc/machine/riscv/memcmp-asm.S | 43 ++++++++++++++++++++++++++
newlib/libc/machine/riscv/memcmp.c | 5 +++
3 files changed, 50 insertions(+)
create mode 100644 newlib/libc/machine/riscv/memcmp-asm.S
create mode 100644 newlib/libc/machine/riscv/memcmp.c
Comments
Hi Pincheng:
> diff --git a/newlib/libc/machine/riscv/memcmp-asm.S b/newlib/libc/machine/riscv/memcmp-asm.S
> new file mode 100644
> index 000000000..8614f66e4
> --- /dev/null
> +++ b/newlib/libc/machine/riscv/memcmp-asm.S
> @@ -0,0 +1,43 @@
> +#if defined(__riscv_vector) && !defined(__OPTIMIZE_SIZE__) && !defined(PREFER_SIZE_OVER_SPEED)
> +.text
> +.option push
> +.option arch, +zve32x
> +.global memcmp
> +.type memcmp, @function
> +memcmp:
> +#if __riscv_landing_pad
> + lpad 0
> +#endif
> + beqz a2, .Lequal
> +.Lloop:
> + vsetvli a3, a2, e8, m8, ta, ma
> +
> + vle8.v v0, (a0)
> + vle8.v v8, (a1)
> +
> + vmsne.vv v16, v0, v8
> + sub a2, a2, a3
> + vfirst.m a4, v16
> +
> + bgez a4, .Lfound
> +
> + add a0, a0, a3
> + add a1, a1, a3
> +
> + bnez a2, .Lloop
> +
> +.Lequal:
> + li a0, 0
> + ret
> +.Lfound:
> + vrgather.vx v16, v0, a4
> + vrgather.vx v24, v8, a4
I would like to avoid those two vrgather here, we can use the result
of vfirst.m and did some arithmetic to get same value as well,
vrgather is a powerful instruction but it might slow.
> + vmv.x.s a0, v16
> + vmv.x.s a4, v24
Also that could prevent those two vec reg -> GPR move, that might be
relative expensive in some uarch.
> + andi a0, a0, 0xff
> + andi a4, a4, 0xff
> + sub a0, a0, a4
> + ret
> +.size memcmp, .-memcmp
> +.option pop
> +#endif
@@ -5,6 +5,8 @@ libc_a_SOURCES += \
%D%/memccpy.c \
%D%/memchr-asm.S \
%D%/memchr.c \
+ %D%/memcmp-asm.S \
+ %D%/memcmp.c \
%D%/memcpy-asm.S \
%D%/memcpy.c \
%D%/memmove-asm.S \
new file mode 100644
@@ -0,0 +1,43 @@
+#if defined(__riscv_vector) && !defined(__OPTIMIZE_SIZE__) && !defined(PREFER_SIZE_OVER_SPEED)
+.text
+.option push
+.option arch, +zve32x
+.global memcmp
+.type memcmp, @function
+memcmp:
+#if __riscv_landing_pad
+ lpad 0
+#endif
+ beqz a2, .Lequal
+.Lloop:
+ vsetvli a3, a2, e8, m8, ta, ma
+
+ vle8.v v0, (a0)
+ vle8.v v8, (a1)
+
+ vmsne.vv v16, v0, v8
+ sub a2, a2, a3
+ vfirst.m a4, v16
+
+ bgez a4, .Lfound
+
+ add a0, a0, a3
+ add a1, a1, a3
+
+ bnez a2, .Lloop
+
+.Lequal:
+ li a0, 0
+ ret
+.Lfound:
+ vrgather.vx v16, v0, a4
+ vrgather.vx v24, v8, a4
+ vmv.x.s a0, v16
+ vmv.x.s a4, v24
+ andi a0, a0, 0xff
+ andi a4, a4, 0xff
+ sub a0, a0, a4
+ ret
+.size memcmp, .-memcmp
+.option pop
+#endif
new file mode 100644
@@ -0,0 +1,5 @@
+#if defined(__OPTIMIZE_SIZE__) || defined(PREFER_SIZE_OVER_SPEED) || !defined(__riscv_vector)
+# include "../../string/memcmp.c"
+#else
+/* memcmp defined in memcmp-asm.S */
+#endif