[1/1] riscv: add vectorized memset, memcpy and memmove

Message ID 20251211132725.435742-2-pincheng.plct@isrc.iscas.ac.cn
State: New
Series: riscv: add vectorized memset, memcpy and memmove

Commit Message

Pincheng Wang Dec. 11, 2025, 1:27 p.m. UTC
The vector implementations use m8 register grouping and process data in
vector-length chunks, providing significant performance improvements on
RVV-capable hardware. Conditional compilation falls back to the scalar
implementations when __riscv_v is not defined, maintaining
compatibility with non-vector RISC-V systems.
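
For memcpy and memmove, the core loop is the usual vl-strided byte
copy (excerpted from the patch below); memset differs only in
broadcasting the fill byte once before entering the loop:

  .Lbulk_copy:
    vsetvli t2, a2, e8, m8, ta, ma    /* t2 = vl (bytes this pass) */
    vle8.v  v0, (t1)                  /* load vl bytes from src */
    vse8.v  v0, (t0)                  /* store vl bytes to dst */
    add     t0, t0, t2
    add     t1, t1, t2
    sub     a2, a2, t2                /* remaining -= vl */
    bnez    a2, .Lbulk_copy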

Signed-off-by: Pincheng Wang <pincheng.plct@isrc.iscas.ac.cn>
---
 newlib/libc/machine/riscv/memcpy-asm.S  | 23 +++++++++++-
 newlib/libc/machine/riscv/memcpy.c      |  2 +-
 newlib/libc/machine/riscv/memmove-asm.S | 47 ++++++++++++++++++++++++-
 newlib/libc/machine/riscv/memmove.c     |  2 +-
 newlib/libc/machine/riscv/memset.S      | 22 ++++++++++++
 5 files changed, 92 insertions(+), 4 deletions(-)
  

Comments

Kito Cheng Dec. 12, 2025, 8:33 a.m. UTC | #1
> diff --git a/newlib/libc/machine/riscv/memcpy-asm.S b/newlib/libc/machine/riscv/memcpy-asm.S
> index 2771285f9..9d1d2d4bd 100644
> --- a/newlib/libc/machine/riscv/memcpy-asm.S
> +++ b/newlib/libc/machine/riscv/memcpy-asm.S
> @@ -9,11 +9,11 @@
>     http://www.opensource.org/licenses.
>  */
>
> -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
>  .text
>  .global memcpy
>  .type  memcpy, @function
>  memcpy:
> +#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)

This change doesn't look right to me: memcpy-asm.S is NOT conditionally
compiled in the Makefile, so if none of PREFER_SIZE_OVER_SPEED,
__OPTIMIZE_SIZE__ or __riscv_v is defined, we will have an empty memcpy
in memcpy-asm.o, and that will then be included in libc.a.

Same issue for memmove-asm.S.
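
i.e. the whole definition needs to stay under the guard, as in the
original layout, so that the object stays empty whenever the C file
provides the implementation instead:

  #if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
  .text
  .global memcpy
  .type   memcpy, @function
  memcpy:
    ...
    .size memcpy, .-memcpy
  #endif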

>    mv a3, a0
>    beqz a2, 2f
>
> @@ -29,4 +29,25 @@ memcpy:
>    ret
>
>    .size        memcpy, .-memcpy
> +#elif defined(__riscv_v)

I suggest using __riscv_vector rather than __riscv_v, so that we can also
use this logic for the Zve* extensions.

> +  .option push
> +  .option arch, +v

and use `arch, +zve32x` here rather than `+v`

> +  mv      t0, a0                    /* running dst */
> +  mv      t1, a1                    /* running src */
> +  beqz    a2, .Ldone_copy           /* n == 0 then return */
> +
> +.Lbulk_copy:
> +  vsetvli t2, a2, e8, m8, ta, ma    /* t2 = vl (bytes) */
> +  vle8.v  v0, (t1)
> +  vse8.v  v0, (t0)
> +  add     t0, t0, t2
> +  add     t1, t1, t2
> +  sub     a2, a2, t2

This sub can be dropped

> +  bnez    a2, .Lbulk_copy

You can use either src(a1)+len(a2) or dst(a0)+len(a2) as the loop condition:

something like:

void *
memcpy (unsigned char *dst, const unsigned char *src, const size_t sz)
{
  unsigned char *const ret = dst;   /* memcpy returns the original dst */
  const unsigned char *end = dst + sz;
  while (dst != end)
    *dst++ = *src++;
  return ret;
}

This optimization could be applied to the other functions as well.
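
For memset, for instance, the same end-pointer loop condition would be
(a minimal scalar sketch, not the proposed vector code):

void *
memset (unsigned char *dst, int c, size_t n)
{
  unsigned char *d = dst;
  const unsigned char *end = d + n;
  while (d != end)
    *d++ = (unsigned char) c;
  return dst;               /* memset returns the original pointer */
}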

> +  /* fallthrough */
> +
> +.Ldone_copy:
> +  ret
> +.size memcpy, .-memcpy
> +.option pop
>  #endif
> diff --git a/newlib/libc/machine/riscv/memcpy.c b/newlib/libc/machine/riscv/memcpy.c
> index a27e0ecb1..cd58c30a5 100644
> --- a/newlib/libc/machine/riscv/memcpy.c
> +++ b/newlib/libc/machine/riscv/memcpy.c
> @@ -10,7 +10,7 @@
>     http://www.opensource.org/licenses.
>  */
>
> -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
> +#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) || defined(__riscv_v)
>  // memcpy defined in memcpy-asm.S
>  #else
>
> diff --git a/newlib/libc/machine/riscv/memmove-asm.S b/newlib/libc/machine/riscv/memmove-asm.S
> index 061472ca2..5cc2e5143 100644
> --- a/newlib/libc/machine/riscv/memmove-asm.S
> +++ b/newlib/libc/machine/riscv/memmove-asm.S
> @@ -9,11 +9,11 @@
>     http://www.opensource.org/licenses.
>  */
>
> -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
>  .text
>  .global memmove
>  .type  memmove, @function
>  memmove:
> +#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
>    beqz a2, .Ldone              /* in case there are 0 bytes to be copied, return immediately */
>
>    mv a4, a0                    /* copy the destination address over to a4, since memmove should return that address in a0 at the end */
> @@ -37,4 +37,49 @@ memmove:
>    ret
>
>    .size        memmove, .-memmove
> +#elif defined(__riscv_v)
> +  .option push
> +  .option arch, +v
> +  beqz    a2, .Ldone_move           /* n == 0 */
> +  beq     a0, a1, .Ldone_move       /* dst == src */
> +
> +  /* overlap check */
> +  bgeu    a1, a0, .Lforward_move    /* src >= dst then forward move*/
> +
> +  sub     t2, a0, a1                /* t2 = dst - src */
> +  bgeu    t2, a2, .Lforward_move    /* no overlap then forward move */
> +
> +  /* backward move */
> +  add     t0, a0, a2                /* running dst_end */
> +  add     t1, a1, a2                /* running src_end */
> +
> +.Lbackward_loop:
> +  vsetvli t3, a2, e8, m8, ta, ma    /* t3 = vl (bytes) */
> +  sub     t0, t0, t3
> +  sub     t1, t1, t3
> +  vle8.v  v0, (t1)
> +  vse8.v  v0, (t0)
> +  sub     a2, a2, t3
> +  bnez    a2, .Lbackward_loop
> +  j       .Ldone_move

`ret` rather than `j .Ldone_move` here; ret and j are both one
instruction, so let's just return directly and save one more jump :)
  
Pincheng Wang Dec. 16, 2025, 2:20 a.m. UTC | #2
Hi Kito,

Sorry for the late reply.

On 2025/12/12 16:33, Kito Cheng wrote:
>> diff --git a/newlib/libc/machine/riscv/memcpy-asm.S b/newlib/libc/machine/riscv/memcpy-asm.S
>> index 2771285f9..9d1d2d4bd 100644
>> --- a/newlib/libc/machine/riscv/memcpy-asm.S
>> +++ b/newlib/libc/machine/riscv/memcpy-asm.S
>> @@ -9,11 +9,11 @@
>>      http://www.opensource.org/licenses.
>>   */
>>
>> -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
>>   .text
>>   .global memcpy
>>   .type  memcpy, @function
>>   memcpy:
>> +#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
> 
> This change doesn't look right to me: memcpy-asm.S is NOT conditionally
> compiled in the Makefile, so if none of PREFER_SIZE_OVER_SPEED,
> __OPTIMIZE_SIZE__ or __riscv_v is defined, we will have an empty memcpy
> in memcpy-asm.o, and that will then be included in libc.a.
> 
> Same issue for memmove-asm.S.
> 
> 

My apologies for not implementing and testing the changes thoroughly 
enough. I'll move the guard back to its original position in the next 
revision.

>>     mv a3, a0
>>     beqz a2, 2f
>>
>> @@ -29,4 +29,25 @@ memcpy:
>>     ret
>>
>>     .size        memcpy, .-memcpy
>> +#elif defined(__riscv_v)
> 
> I suggest using __riscv_vector rather than __riscv_v, so that we can also
> use this logic for the Zve* extensions.
> 
>> +  .option push
>> +  .option arch, +v
> 
> and use `arch, +zve32x` here rather than `+v`
> 

Will replace the macro and the `arch, +v` directive to support both the
full V and the Zve* extensions.

>> +  mv      t0, a0                    /* running dst */
>> +  mv      t1, a1                    /* running src */
>> +  beqz    a2, .Ldone_copy           /* n == 0 then return */
>> +
>> +.Lbulk_copy:
>> +  vsetvli t2, a2, e8, m8, ta, ma    /* t2 = vl (bytes) */
>> +  vle8.v  v0, (t1)
>> +  vse8.v  v0, (t0)
>> +  add     t0, t0, t2
>> +  add     t1, t1, t2
>> +  sub     a2, a2, t2
> 
> This sub can be dropped
> 
>> +  bnez    a2, .Lbulk_copy
> 
> You can use either src(a1)+len(a2) or dst(a0)+len(a2) as the loop condition:
> 
> something like:
> 
> void *
> memcpy (unsigned char *dst, const unsigned char *src, const size_t sz)
> {
>   unsigned char *const ret = dst;   /* memcpy returns the original dst */
>   const unsigned char *end = dst + sz;
>   while (dst != end)
>     *dst++ = *src++;
>   return ret;
> }
> 
> This optimization could be applied to the other functions as well.
> 

Thanks for the suggestion. Will restructure the loop conditions as you 
suggested.

>> +  /* fallthrough */
>> +
>> +.Ldone_copy:
>> +  ret
>> +.size memcpy, .-memcpy
>> +.option pop
>>   #endif
>> diff --git a/newlib/libc/machine/riscv/memcpy.c b/newlib/libc/machine/riscv/memcpy.c
>> index a27e0ecb1..cd58c30a5 100644
>> --- a/newlib/libc/machine/riscv/memcpy.c
>> +++ b/newlib/libc/machine/riscv/memcpy.c
>> @@ -10,7 +10,7 @@
>>      http://www.opensource.org/licenses.
>>   */
>>
>> -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
>> +#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) || defined(__riscv_v)
>>   // memcpy defined in memcpy-asm.S
>>   #else
>>
>> diff --git a/newlib/libc/machine/riscv/memmove-asm.S b/newlib/libc/machine/riscv/memmove-asm.S
>> index 061472ca2..5cc2e5143 100644
>> --- a/newlib/libc/machine/riscv/memmove-asm.S
>> +++ b/newlib/libc/machine/riscv/memmove-asm.S
>> @@ -9,11 +9,11 @@
>>      http://www.opensource.org/licenses.
>>   */
>>
>> -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
>>   .text
>>   .global memmove
>>   .type  memmove, @function
>>   memmove:
>> +#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
>>     beqz a2, .Ldone              /* in case there are 0 bytes to be copied, return immediately */
>>
>>     mv a4, a0                    /* copy the destination address over to a4, since memmove should return that address in a0 at the end */
>> @@ -37,4 +37,49 @@ memmove:
>>     ret
>>
>>     .size        memmove, .-memmove
>> +#elif defined(__riscv_v)
>> +  .option push
>> +  .option arch, +v
>> +  beqz    a2, .Ldone_move           /* n == 0 */
>> +  beq     a0, a1, .Ldone_move       /* dst == src */
>> +
>> +  /* overlap check */
>> +  bgeu    a1, a0, .Lforward_move    /* src >= dst then forward move*/
>> +
>> +  sub     t2, a0, a1                /* t2 = dst - src */
>> +  bgeu    t2, a2, .Lforward_move    /* no overlap then forward move */
>> +
>> +  /* backward move */
>> +  add     t0, a0, a2                /* running dst_end */
>> +  add     t1, a1, a2                /* running src_end */
>> +
>> +.Lbackward_loop:
>> +  vsetvli t3, a2, e8, m8, ta, ma    /* t3 = vl (bytes) */
>> +  sub     t0, t0, t3
>> +  sub     t1, t1, t3
>> +  vle8.v  v0, (t1)
>> +  vse8.v  v0, (t0)
>> +  sub     a2, a2, t3
>> +  bnez    a2, .Lbackward_loop
>> +  j       .Ldone_move
> 
> `ret` rather than `j .Ldone_move` here; ret and j are both one
> instruction, so let's just return directly and save one more jump :)

Absolutely. I'll make that change in the next revision.

Thank you very much for the detailed and thoughtful feedback. I really 
appreciate your guidance. I'll post v2 of the patch shortly.

Best regards,
Pincheng Wang
  
Christian Herber (OSS) Dec. 16, 2025, 9:36 a.m. UTC | #3
Hi Pincheng,

Is there a clear advantage to using assembly over intrinsics?

Christian

  
Pincheng Wang Dec. 16, 2025, 1:22 p.m. UTC | #4
Hi Christian,

I think the answer depends quite a bit on the current state of the 
RISC-V toolchain.

For RVV intrinsics specifically, a practical consideration is compiler
availability and compatibility. RVV intrinsics based on the v0.11 spec
are supported by Clang 16 and GCC 13, while support for RVV v1.0 
intrinsics only became available starting with Clang 19 and GCC 14. From 
that perspective, using hand-written assembly avoids introducing a hard 
dependency on newer compiler versions.

That said, intrinsics are clearly preferable from a maintainability and 
readability standpoint, and are likely the better long-term direction 
once RVV intrinsics are consistently available across toolchains.

Given these considerations, I'd be interested to hear the community's
thoughts on the preferred direction here. If there is interest, I would
be happy to put together an intrinsics-based implementation for
comparison as well.
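
For what it's worth, a minimal intrinsics-based memcpy along the lines
of Kito's end-pointer loop might look like the sketch below (assuming
the v1.0 intrinsics API from <riscv_vector.h>; untested):

#include <riscv_vector.h>
#include <stddef.h>
#include <stdint.h>

void *
memcpy (void *dst, const void *src, size_t n)
{
  uint8_t *d = dst;
  const uint8_t *s = src;
  const uint8_t *end = d + n;

  while (d != end)
    {
      size_t vl = __riscv_vsetvl_e8m8 ((size_t) (end - d)); /* vl <= remaining */
      vuint8m8_t v = __riscv_vle8_v_u8m8 (s, vl);           /* load vl bytes  */
      __riscv_vse8_v_u8m8 (d, v, vl);                       /* store vl bytes */
      d += vl;
      s += vl;
    }
  return dst;
}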

Best regards,
Pincheng Wang

On 2025/12/16 17:36, Christian Herber (OSS) wrote:
> Hi Pincheng,
> 
> Is there a clear advantage to using assembly over intrinsics?
> 
> Christian
  

Patch

diff --git a/newlib/libc/machine/riscv/memcpy-asm.S b/newlib/libc/machine/riscv/memcpy-asm.S
index 2771285f9..9d1d2d4bd 100644
--- a/newlib/libc/machine/riscv/memcpy-asm.S
+++ b/newlib/libc/machine/riscv/memcpy-asm.S
@@ -9,11 +9,11 @@ 
    http://www.opensource.org/licenses.
 */
 
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
 .text
 .global memcpy
 .type	memcpy, @function
 memcpy:
+#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
   mv a3, a0
   beqz a2, 2f
 
@@ -29,4 +29,25 @@  memcpy:
   ret
 
   .size	memcpy, .-memcpy
+#elif defined(__riscv_v)
+  .option push
+  .option arch, +v
+  mv      t0, a0                    /* running dst */
+  mv      t1, a1                    /* running src */
+  beqz    a2, .Ldone_copy           /* n == 0 then return */
+
+.Lbulk_copy:
+  vsetvli t2, a2, e8, m8, ta, ma    /* t2 = vl (bytes) */
+  vle8.v  v0, (t1)
+  vse8.v  v0, (t0)
+  add     t0, t0, t2
+  add     t1, t1, t2
+  sub     a2, a2, t2
+  bnez    a2, .Lbulk_copy
+  /* fallthrough */
+
+.Ldone_copy:
+  ret
+.size memcpy, .-memcpy
+.option pop
 #endif
diff --git a/newlib/libc/machine/riscv/memcpy.c b/newlib/libc/machine/riscv/memcpy.c
index a27e0ecb1..cd58c30a5 100644
--- a/newlib/libc/machine/riscv/memcpy.c
+++ b/newlib/libc/machine/riscv/memcpy.c
@@ -10,7 +10,7 @@ 
    http://www.opensource.org/licenses.
 */
 
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
+#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) || defined(__riscv_v)
 // memcpy defined in memcpy-asm.S
 #else
 
diff --git a/newlib/libc/machine/riscv/memmove-asm.S b/newlib/libc/machine/riscv/memmove-asm.S
index 061472ca2..5cc2e5143 100644
--- a/newlib/libc/machine/riscv/memmove-asm.S
+++ b/newlib/libc/machine/riscv/memmove-asm.S
@@ -9,11 +9,11 @@ 
    http://www.opensource.org/licenses.
 */
 
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
 .text
 .global memmove
 .type	memmove, @function
 memmove:
+#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
   beqz a2, .Ldone		/* in case there are 0 bytes to be copied, return immediately */
 
   mv a4, a0			/* copy the destination address over to a4, since memmove should return that address in a0 at the end */
@@ -37,4 +37,49 @@  memmove:
   ret
 
   .size	memmove, .-memmove
+#elif defined(__riscv_v)
+  .option push
+  .option arch, +v
+  beqz    a2, .Ldone_move           /* n == 0 */
+  beq     a0, a1, .Ldone_move       /* dst == src */
+
+  /* overlap check */
+  bgeu    a1, a0, .Lforward_move    /* src >= dst then forward move*/
+
+  sub     t2, a0, a1                /* t2 = dst - src */
+  bgeu    t2, a2, .Lforward_move    /* no overlap then forward move */
+
+  /* backward move */
+  add     t0, a0, a2                /* running dst_end */
+  add     t1, a1, a2                /* running src_end */
+
+.Lbackward_loop:
+  vsetvli t3, a2, e8, m8, ta, ma    /* t3 = vl (bytes) */
+  sub     t0, t0, t3
+  sub     t1, t1, t3
+  vle8.v  v0, (t1)
+  vse8.v  v0, (t0)
+  sub     a2, a2, t3
+  bnez    a2, .Lbackward_loop
+  j       .Ldone_move
+
+/* forward move, same as memcpy */
+.Lforward_move:
+  mv      t0, a0                    /* running dst */
+  mv      t1, a1                    /* running src */
+
+.Lforward_loop:
+  vsetvli t3, a2, e8, m8, ta, ma
+  vle8.v  v0, (t1)
+  vse8.v  v0, (t0)
+  add     t0, t0, t3
+  add     t1, t1, t3
+  sub     a2, a2, t3
+  bnez    a2, .Lforward_loop
+  /* fallthrough */
+
+.Ldone_move:
+  ret
+.size memmove, .-memmove
+.option pop
 #endif
diff --git a/newlib/libc/machine/riscv/memmove.c b/newlib/libc/machine/riscv/memmove.c
index 209a75c69..67ce08b02 100644
--- a/newlib/libc/machine/riscv/memmove.c
+++ b/newlib/libc/machine/riscv/memmove.c
@@ -10,7 +10,7 @@ 
    http://www.opensource.org/licenses.
 */
 
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
+#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) || defined(__riscv_v)
 /* memmove defined in memmove-asm.S */
 #else
 
diff --git a/newlib/libc/machine/riscv/memset.S b/newlib/libc/machine/riscv/memset.S
index 533f66758..80f43fbaf 100644
--- a/newlib/libc/machine/riscv/memset.S
+++ b/newlib/libc/machine/riscv/memset.S
@@ -63,6 +63,28 @@  memset:
 .Ldone:
   ret
 
+#elif defined(__riscv_v)
+  .option push
+  .option arch, +v
+  mv      t0, a0                    /* running dst; keep a0 as return */
+  beqz    a2, .Ldone_vect           /* n == 0 then return */
+
+  /* Broadcast fill byte once. */
+  vsetvli t1, zero, e8, m8, ta, ma
+  vmv.v.x v0, a1
+
+.Lbulk_vect:
+  vsetvli t1, a2, e8, m8, ta, ma    /* t1 = vl (bytes) */
+  vse8.v  v0, (t0)
+  add     t0, t0, t1
+  sub     a2, a2, t1
+  bnez    a2, .Lbulk_vect
+  /* fallthrough */
+
+.Ldone_vect:
+  ret
+  .option pop
+
 #else
   li     REG_TABLE, BYTE_TBL_SZ
   mv     a3, a0