[v2,1/1] riscv: add vectorized memset, memcpy and memmove

Message ID 20251225160856.16010-2-pincheng.plct@isrc.iscas.ac.cn
State New
Headers
Series riscv: add vectorized memset, memcpy and memmove

Commit Message

Pincheng Wang Dec. 25, 2025, 4:08 p.m. UTC
  The vector implementations use m8 register grouping and process data in
vector-length chunks, providing significant performance improvements on
RVV-capable hardware. Use conditional compilation to fall back to scalar
implementations when __riscv_vector is not defined, maintaining
compatibility with non-vector RISC-V systems.

Signed-off-by: Pincheng Wang <pincheng.plct@isrc.iscas.ac.cn>
---
 newlib/libc/machine/riscv/memcpy-asm.S  | 54 +++++++++++++-
 newlib/libc/machine/riscv/memcpy.c      |  2 +-
 newlib/libc/machine/riscv/memmove-asm.S | 95 ++++++++++++++++++++++++-
 newlib/libc/machine/riscv/memmove.c     |  2 +-
 newlib/libc/machine/riscv/memset.S      | 21 +++++-
 5 files changed, 169 insertions(+), 5 deletions(-)
  

Patch

diff --git a/newlib/libc/machine/riscv/memcpy-asm.S b/newlib/libc/machine/riscv/memcpy-asm.S
index 2771285f9..a5f085f42 100644
--- a/newlib/libc/machine/riscv/memcpy-asm.S
+++ b/newlib/libc/machine/riscv/memcpy-asm.S
@@ -9,7 +9,59 @@ 
    http://www.opensource.org/licenses.
 */
 
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
+#if defined(__riscv_vector)
+.text
+.global memcpy
+.type memcpy, @function
+.option push
+.option arch, +zve32x
+memcpy:
+  mv     t0, a0                   /* t0 = running dst; a0 is preserved as the return value */
+  mv     t1, a1                   /* t1 = running src */
+  beqz   a2, .Ldone               /* n == 0: nothing to copy */
+
+  /* Align dst to SZREG (XLEN/8 bytes); skipped when __riscv_misaligned_fast */
+#ifndef __riscv_misaligned_fast
+  /* copies below 32 bytes go straight to the vector loop, unaligned */
+  li     t3, 32
+  bltu   a2, t3, .Lbulk_copy
+#if __riscv_xlen == 64
+  andi   t2, t0, 7                /* t2 = dst & 7 */
+  beqz   t2, .Lbulk_copy       /* already aligned to 8 bytes */
+  li     t4, 8
+  sub    t2, t4, t2               /* pad = 8 - (dst & 7) */
+#else
+  andi   t2, t0, 3                /* t2 = dst & 3 */
+  beqz   t2, .Lbulk_copy       /* already aligned to 4 bytes */
+  li     t4, 4
+  sub    t2, t4, t2               /* pad = 4 - (dst & 3) */
+#endif
+  /* copy the pad bytes with one vector op (pad < 8 <= VLMAX at e8/m8, so vl == pad) */
+  vsetvli t3, t2, e8, m8, ta, ma
+  vle8.v v0, (t1)
+  vse8.v v0, (t0)
+  add    t0, t0, t3
+  add    t1, t1, t3
+  sub    a2, a2, t3
+  beqz   a2, .Ldone               /* n <= pad would leave a2 == 0 here */
+#endif
+
+.Lbulk_copy:
+  vsetvli t2, a2, e8, m8, ta, ma  /* t2 = vl = bytes handled this iteration */
+  vle8.v v0, (t1)
+  vse8.v v0, (t0)
+  add    t0, t0, t2
+  add    t1, t1, t2
+  sub    a2, a2, t2
+  bnez   a2, .Lbulk_copy
+  /* fallthrough */
+
+.Ldone:
+  ret
+
+  .size memcpy, .-memcpy
+  .option pop
+#elif defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
 .text
 .global memcpy
 .type	memcpy, @function
diff --git a/newlib/libc/machine/riscv/memcpy.c b/newlib/libc/machine/riscv/memcpy.c
index a27e0ecb1..7fa0ff804 100644
--- a/newlib/libc/machine/riscv/memcpy.c
+++ b/newlib/libc/machine/riscv/memcpy.c
@@ -10,7 +10,7 @@ 
    http://www.opensource.org/licenses.
 */
 
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
+#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) || defined(__riscv_vector)
 // memcpy defined in memcpy-asm.S
 #else
 
diff --git a/newlib/libc/machine/riscv/memmove-asm.S b/newlib/libc/machine/riscv/memmove-asm.S
index 061472ca2..fea1741d2 100644
--- a/newlib/libc/machine/riscv/memmove-asm.S
+++ b/newlib/libc/machine/riscv/memmove-asm.S
@@ -9,7 +9,100 @@ 
    http://www.opensource.org/licenses.
 */
 
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
+#if defined(__riscv_vector)
+.text
+.global memmove
+.type memmove, @function
+.option push
+.option arch, +zve32x
+memmove:
+  beqz   a2, .Ldone_move          /* n == 0: nothing to move */
+  beq    a0, a1, .Ldone_move      /* dst == src: nothing to move */
+
+  /* Direction choice: copy backward only when regions overlap with dst above src */
+  bgeu   a1, a0, .Lforward_move   /* src >= dst then forward move */
+  sub    t2, a0, a1               /* t2 = dst - src */
+  bgeu   t2, a2, .Lforward_move   /* no overlap then forward move */
+
+  /* backward move: copy chunks from the highest addresses downward */
+  add    t0, a0, a2               /* running dst_end */
+  add    t1, a1, a2               /* running src_end */
+  /* Align dst_end to SZREG: skip when __riscv_misaligned_fast, else align */
+#ifndef __riscv_misaligned_fast
+  /* moves below 32 bytes go straight to the vector loop, unaligned */
+  li     t3, 32
+  bltu   a2, t3, .Lbackward_loop
+
+#if __riscv_xlen == 64
+  andi   t2, t0, 7                /* misalignment = dst_end & 7 */
+#else
+  andi   t2, t0, 3                /* misalignment = dst_end & 3 */
+#endif
+  beqz   t2, .Lbackward_aligned   /* already aligned */
+  /* copy the trailing misaligned bytes so dst_end becomes aligned */
+  vsetvli t3, t2, e8, m8, ta, ma
+  sub    t0, t0, t3
+  sub    t1, t1, t3
+  vle8.v v0, (t1)
+  vse8.v v0, (t0)
+  sub    a2, a2, t3
+.Lbackward_aligned:
+#endif
+.Lbackward_loop:
+  vsetvli t3, a2, e8, m8, ta, ma   /* t3 = vl (bytes) */
+  sub    t0, t0, t3
+  sub    t1, t1, t3
+  vle8.v v0, (t1)                  /* whole chunk is loaded before the store, */
+  vse8.v v0, (t0)                  /* so overlap inside one chunk is still safe */
+  sub    a2, a2, t3
+  bnez   a2, .Lbackward_loop
+  ret
+
+  /* forward move, same as memcpy */
+.Lforward_move:
+  mv     t0, a0                   /* running dst; a0 is preserved as return value */
+  mv     t1, a1                   /* running src */
+  /* Align dst to SZREG: skip when __riscv_misaligned_fast, else align */
+#ifndef __riscv_misaligned_fast
+  /* moves below 32 bytes go straight to the vector loop, unaligned */
+  li     t3, 32
+  bltu   a2, t3, .Lforward_loop
+
+#if __riscv_xlen == 64
+  andi   t2, t0, 7                /* t2 = dst & 7 */
+  beqz   t2, .Lforward_aligned    /* already aligned to 8 bytes */
+  li     t4, 8
+  sub    t2, t4, t2               /* pad = 8 - (dst & 7) */
+#else
+  andi   t2, t0, 3                /* t2 = dst & 3 */
+  beqz   t2, .Lforward_aligned    /* already aligned to 4 bytes */
+  li     t4, 4
+  sub    t2, t4, t2               /* pad = 4 - (dst & 3) */
+#endif
+  /* copy the pad bytes with one vector op (pad < 8, so vl == pad) */
+  vsetvli t3, t2, e8, m8, ta, ma
+  vle8.v v0, (t1)
+  vse8.v v0, (t0)
+  add    t0, t0, t3
+  add    t1, t1, t3
+  sub    a2, a2, t3
+.Lforward_aligned:
+#endif
+.Lforward_loop:
+  vsetvli t3, a2, e8, m8, ta, ma   /* t3 = vl (bytes) */
+  vle8.v v0, (t1)
+  vse8.v v0, (t0)
+  add    t0, t0, t3
+  add    t1, t1, t3
+  sub    a2, a2, t3
+  bnez   a2, .Lforward_loop
+  /* fallthrough */
+
+.Ldone_move:
+  ret
+  .size memmove, .-memmove
+  .option pop
+#elif defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
 .text
 .global memmove
 .type	memmove, @function
diff --git a/newlib/libc/machine/riscv/memmove.c b/newlib/libc/machine/riscv/memmove.c
index 209a75c69..691774e2e 100644
--- a/newlib/libc/machine/riscv/memmove.c
+++ b/newlib/libc/machine/riscv/memmove.c
@@ -10,7 +10,7 @@ 
    http://www.opensource.org/licenses.
 */
 
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
+#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) || defined(__riscv_vector)
 /* memmove defined in memmove-asm.S */
 #else
 
diff --git a/newlib/libc/machine/riscv/memset.S b/newlib/libc/machine/riscv/memset.S
index 533f66758..9ade879f8 100644
--- a/newlib/libc/machine/riscv/memset.S
+++ b/newlib/libc/machine/riscv/memset.S
@@ -50,7 +50,26 @@ 
 
 
 memset:
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
+#if defined(__riscv_vector)
+.option push
+.option arch, +zve32x
+  mv     t0, a0                    /* running dst; keep a0 as return */
+  beqz   a2, .Ldone_set            /* n == 0 then return */
+
+  /* Broadcast fill byte once at VLMAX; later vsetvli only shrinks vl. */
+  vsetvli t1, zero, e8, m8, ta, ma
+  vmv.v.x v0, a1                   /* v0[i] = (uint8_t)a1 for all lanes */
+
+.Lbulk_set:
+  vsetvli t1, a2, e8, m8, ta, ma   /* t1 = vl (bytes) */
+  vse8.v v0, (t0)
+  add    t0, t0, t1
+  sub    a2, a2, t1
+  bnez   a2, .Lbulk_set
+.Ldone_set:
+  ret
+.option pop
+#elif defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
   mv     a3, a0
   beqz   a2, .Ldone