The vector implementation processes data in vector-length chunks,
providing significant performance improvements on RVV-capable hardware.
Use conditional compilation to fall back to the scalar implementation
when __riscv_vector is not defined or when optimizing for size
(__OPTIMIZE_SIZE__ or PREFER_SIZE_OVER_SPEED), maintaining compatibility
with non-vector RISC-V systems.
Signed-off-by: Pincheng Wang <pincheng.plct@isrc.iscas.ac.cn>
---
newlib/libc/machine/riscv/Makefile.inc | 1 +
newlib/libc/machine/riscv/memrchr-asm.S | 40 +++++++++++++++++++++++++
newlib/libc/machine/riscv/memrchr.c | 4 +++
3 files changed, 45 insertions(+)
create mode 100644 newlib/libc/machine/riscv/memrchr-asm.S
@@ -13,6 +13,7 @@ libc_a_SOURCES += \
%D%/memmove.c \
%D%/mempcpy-asm.S \
%D%/mempcpy.c \
+ %D%/memrchr-asm.S \
%D%/memrchr.c \
%D%/memset.S \
%D%/setjmp.S \
new file mode 100644
@@ -0,0 +1,40 @@
+#if defined(__riscv_vector) && !defined(__OPTIMIZE_SIZE__) && !defined(PREFER_SIZE_OVER_SPEED)
+.text
+.option push
+.option arch, +zve32x /* let the assembler accept vector instructions until .option pop */
+.global memrchr
+.type memrchr, @function
+memrchr: /* In: a0 = buffer start, a1 = byte value, a2 = length. Out: a0 = address of the last matching byte, or 0 if none. */
+#if __riscv_landing_pad
+ lpad 0
+#endif
+ andi a1, a1, 0xff /* keep only the low 8 bits of the search value */
+ add a0, a0, a2 /* a0 = one past the end; the scan walks backwards from here */
+.Lloop:
+ beqz a2, .Lnohit /* nothing left to scan: no match */
+ /* Take the next chunk from the tail; a3 = number of bytes handled in this pass. */
+ vsetvli a3, a2, e8, m4, ta, ma
+ sub a0, a0, a3 /* a0 = start of this chunk */
+ vle8.v v8, (a0)
+ sub a2, a2, a3 /* a2 = bytes still unscanned below this chunk */
+ /* v0 = mask of lanes equal to a1; a4 < 0 means no lane in this chunk matched. */
+ vmseq.vx v0, v8, a1
+ vfirst.m a4, v0
+ bltz a4, .Lloop /* no hit here: step back to the previous chunk */
+ /* Hit found. e16/m8 has the same SEW/LMUL ratio as e8/m4, so vl stays a3 while lanes become 16-bit. */
+ vsetvli zero, a3, e16, m8, ta, ma
+ vid.v v16 /* v16[i] = i (lane index) */
+
+ /* Reduce max index over hit lanes. */
+ vmv.s.x v24, x0 /* seed the reduction with 0 */
+ vredmaxu.vs v24, v16, v24, v0.t
+ vmv.x.s a5, v24 /* a5 = highest matching lane index within the chunk */
+
+ add a0, a0, a5 /* chunk start + index = address of the last match */
+ ret
+.Lnohit:
+ li a0, 0 /* no match: return 0 (NULL) */
+ ret
+.size memrchr, .-memrchr
+.option pop
+#endif
@@ -28,6 +28,9 @@ PORTABILITY
QUICKREF
memrchr
*/
+#if defined(__riscv_vector) && !defined(__OPTIMIZE_SIZE__) && !defined(PREFER_SIZE_OVER_SPEED)
+/* memrchr defined in memrchr-asm.S */
+#else
#include <sys/asm.h>
#include <stddef.h>
@@ -170,3 +173,4 @@ memrchr (const void *src_void,
return NULL;
}
+#endif