[1/1] riscv: add vectorized memset, memcpy and memmove
Commit Message
The vector implementations use m8 register grouping and process data in
vector-length chunks, providing significant performance improvements on
RVV-capable hardware. Use conditional compilation to fall back to the
scalar implementations when __riscv_v is not defined, maintaining
compatibility with non-vector RISC-V systems.
Signed-off-by: Pincheng Wang <pincheng.plct@isrc.iscas.ac.cn>
---
newlib/libc/machine/riscv/memcpy-asm.S | 23 +++++++++++-
newlib/libc/machine/riscv/memcpy.c | 2 +-
newlib/libc/machine/riscv/memmove-asm.S | 47 ++++++++++++++++++++++++-
newlib/libc/machine/riscv/memmove.c | 2 +-
newlib/libc/machine/riscv/memset.S | 22 ++++++++++++
5 files changed, 92 insertions(+), 4 deletions(-)
Comments
> diff --git a/newlib/libc/machine/riscv/memcpy-asm.S b/newlib/libc/machine/riscv/memcpy-asm.S
> index 2771285f9..9d1d2d4bd 100644
> --- a/newlib/libc/machine/riscv/memcpy-asm.S
> +++ b/newlib/libc/machine/riscv/memcpy-asm.S
> @@ -9,11 +9,11 @@
> http://www.opensource.org/licenses.
> */
>
> -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
> .text
> .global memcpy
> .type memcpy, @function
> memcpy:
> +#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
This change doesn't seem right to me: memcpy-asm.S is NOT conditionally
compiled in the Makefile, so if none of PREFER_SIZE_OVER_SPEED,
__OPTIMIZE_SIZE__ or __riscv_v is defined we end up with an empty memcpy
in memcpy-asm.o, which is then included in libc.a.
Same issue for memmove-asm.S.
> mv a3, a0
> beqz a2, 2f
>
> @@ -29,4 +29,25 @@ memcpy:
> ret
>
> .size memcpy, .-memcpy
> +#elif defined(__riscv_v)
Suggest using __riscv_vector rather than __riscv_v, so that the same
logic also covers the Zve* extensions.
> + .option push
> + .option arch, +v
and arch, +zve32x here rather than +v
> + mv t0, a0 /* running dst */
> + mv t1, a1 /* running src */
> + beqz a2, .Ldone_copy /* n == 0 then return */
> +
> +.Lbulk_copy:
> + vsetvli t2, a2, e8, m8, ta, ma /* t2 = vl (bytes) */
> + vle8.v v0, (t1)
> + vse8.v v0, (t0)
> + add t0, t0, t2
> + add t1, t1, t2
> + sub a2, a2, t2
This sub can be dropped.
> + bnez a2, .Lbulk_copy
You can use either src (a1) + len (a2) or dst (a0) + len (a2) as the loop condition,
something like:
void *
memcpy (unsigned char *dst, const unsigned char *src, const size_t sz)
{
  void *ret = dst;
  const unsigned char *end = dst + sz;
  while (dst != end)
    *dst++ = *src++;
  return ret;
}
This optimization could be applied to the other functions as well.
> + /* fallthrough */
> +
> +.Ldone_copy:
> + ret
> +.size memcpy, .-memcpy
> +.option pop
> #endif
> diff --git a/newlib/libc/machine/riscv/memcpy.c b/newlib/libc/machine/riscv/memcpy.c
> index a27e0ecb1..cd58c30a5 100644
> --- a/newlib/libc/machine/riscv/memcpy.c
> +++ b/newlib/libc/machine/riscv/memcpy.c
> @@ -10,7 +10,7 @@
> http://www.opensource.org/licenses.
> */
>
> -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
> +#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) || defined(__riscv_v)
> // memcpy defined in memcpy-asm.S
> #else
>
> diff --git a/newlib/libc/machine/riscv/memmove-asm.S b/newlib/libc/machine/riscv/memmove-asm.S
> index 061472ca2..5cc2e5143 100644
> --- a/newlib/libc/machine/riscv/memmove-asm.S
> +++ b/newlib/libc/machine/riscv/memmove-asm.S
> @@ -9,11 +9,11 @@
> http://www.opensource.org/licenses.
> */
>
> -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
> .text
> .global memmove
> .type memmove, @function
> memmove:
> +#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
> beqz a2, .Ldone /* in case there are 0 bytes to be copied, return immediately */
>
> mv a4, a0 /* copy the destination address over to a4, since memmove should return that address in a0 at the end */
> @@ -37,4 +37,49 @@ memmove:
> ret
>
> .size memmove, .-memmove
> +#elif defined(__riscv_v)
> + .option push
> + .option arch, +v
> + beqz a2, .Ldone_move /* n == 0 */
> + beq a0, a1, .Ldone_move /* dst == src */
> +
> + /* overlap check */
> + bgeu a1, a0, .Lforward_move /* src >= dst then forward move*/
> +
> + sub t2, a0, a1 /* t2 = dst - src */
> + bgeu t2, a2, .Lforward_move /* no overlap then forward move */
> +
> + /* backward move */
> + add t0, a0, a2 /* running dst_end */
> + add t1, a1, a2 /* running src_end */
> +
> +.Lbackward_loop:
> + vsetvli t3, a2, e8, m8, ta, ma /* t3 = vl (bytes) */
> + sub t0, t0, t3
> + sub t1, t1, t3
> + vle8.v v0, (t1)
> + vse8.v v0, (t0)
> + sub a2, a2, t3
> + bnez a2, .Lbackward_loop
> + j .Ldone_move
`ret` rather than `j .Ldone_move` here; ret and j are both one
instruction, so let's just return and save one more jump :)
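For reference, a rough C equivalent of the overlap check and copy-direction
choice above (illustrative only, not part of the patch; the plain byte loops
stand in for the vl-byte vector chunks, and the name memmove_ref is made up):

#include <stddef.h>
#include <stdint.h>

void *
memmove_ref (void *dst, const void *src, size_t n)
{
  uint8_t *d = dst;
  const uint8_t *s = src;

  if (n == 0 || d == s)                    /* nothing to do */
    return dst;

  if (s >= d || (size_t) (d - s) >= n)
    {
      /* Forward copy: dst is below src, or the regions do not overlap.  */
      while (n--)
        *d++ = *s++;
    }
  else
    {
      /* Backward copy: overlap with dst above src, so start from the end.  */
      d += n;
      s += n;
      while (n--)
        *--d = *--s;
    }

  return dst;
}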
Hi Kito,
Sorry for the late reply.
On 2025/12/12 16:33, Kito Cheng wrote:
>> diff --git a/newlib/libc/machine/riscv/memcpy-asm.S b/newlib/libc/machine/riscv/memcpy-asm.S
>> index 2771285f9..9d1d2d4bd 100644
>> --- a/newlib/libc/machine/riscv/memcpy-asm.S
>> +++ b/newlib/libc/machine/riscv/memcpy-asm.S
>> @@ -9,11 +9,11 @@
>> http://www.opensource.org/licenses.
>> */
>>
>> -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
>> .text
>> .global memcpy
>> .type memcpy, @function
>> memcpy:
>> +#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
>
> This seems not right change to me, memcpy-asm.S is NOT conditional
> compile in the Makefile, so that mean if we didn't defined
> PREFER_SIZE_OVER_SPEED, __OPTIMIZE_SIZE__ or __riscv_v, we will have a
> empty memcpy in memcpy-asm.o and then this will included in libc.a
>
> Same issue for memmove-asm.S
>
My apologies for not implementing and testing the changes thoroughly
enough. I'll move the guard back to its original position in the next
revision.
>> mv a3, a0
>> beqz a2, 2f
>>
>> @@ -29,4 +29,25 @@ memcpy:
>> ret
>>
>> .size memcpy, .-memcpy
>> +#elif defined(__riscv_v)
>
> Suggest use __riscv_vector rather than __riscv_v, so that we can also
> use that logic for zve* extensions.
>
>> + .option push
>> + .option arch, +v
>
> and arch, +zve32x here rather than +v
>
I'll replace the macro and the arch,+v directive so that both the full V and the Zve* extensions are supported.
>> + mv t0, a0 /* running dst */
>> + mv t1, a1 /* running src */
>> + beqz a2, .Ldone_copy /* n == 0 then return */
>> +
>> +.Lbulk_copy:
>> + vsetvli t2, a2, e8, m8, ta, ma /* t2 = vl (bytes) */
>> + vle8.v v0, (t1)
>> + vse8.v v0, (t0)
>> + add t0, t0, t2
>> + add t1, t1, t2
>> + sub a2, a2, t2
>
> This sub can be drop
>
>> + bnez a2, .Lbulk_copy
>
> You can use either src(a1)+len(a2) or dst(a0)+len(a2) for loop condition:
>
> something like:
>
> void *
> memcpy(unsigned char *dst, const unsigned char *src,
> const size_t sz)
> {
> const unsigned char *end = dst + sz;
> while (dst != end)
> *dst++ = *src++;
> return dst;
> }
>
> This optimization could be applied on other function as well
>
Thanks for the suggestion. Will restructure the loop conditions as you
suggested.
>> + /* fallthrough */
>> +
>> +.Ldone_copy:
>> + ret
>> +.size memcpy, .-memcpy
>> +.option pop
>> #endif
>> diff --git a/newlib/libc/machine/riscv/memcpy.c b/newlib/libc/machine/riscv/memcpy.c
>> index a27e0ecb1..cd58c30a5 100644
>> --- a/newlib/libc/machine/riscv/memcpy.c
>> +++ b/newlib/libc/machine/riscv/memcpy.c
>> @@ -10,7 +10,7 @@
>> http://www.opensource.org/licenses.
>> */
>>
>> -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
>> +#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) || defined(__riscv_v)
>> // memcpy defined in memcpy-asm.S
>> #else
>>
>> diff --git a/newlib/libc/machine/riscv/memmove-asm.S b/newlib/libc/machine/riscv/memmove-asm.S
>> index 061472ca2..5cc2e5143 100644
>> --- a/newlib/libc/machine/riscv/memmove-asm.S
>> +++ b/newlib/libc/machine/riscv/memmove-asm.S
>> @@ -9,11 +9,11 @@
>> http://www.opensource.org/licenses.
>> */
>>
>> -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
>> .text
>> .global memmove
>> .type memmove, @function
>> memmove:
>> +#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
>> beqz a2, .Ldone /* in case there are 0 bytes to be copied, return immediately */
>>
>> mv a4, a0 /* copy the destination address over to a4, since memmove should return that address in a0 at the end */
>> @@ -37,4 +37,49 @@ memmove:
>> ret
>>
>> .size memmove, .-memmove
>> +#elif defined(__riscv_v)
>> + .option push
>> + .option arch, +v
>> + beqz a2, .Ldone_move /* n == 0 */
>> + beq a0, a1, .Ldone_move /* dst == src */
>> +
>> + /* overlap check */
>> + bgeu a1, a0, .Lforward_move /* src >= dst then forward move*/
>> +
>> + sub t2, a0, a1 /* t2 = dst - src */
>> + bgeu t2, a2, .Lforward_move /* no overlap then forward move */
>> +
>> + /* backward move */
>> + add t0, a0, a2 /* running dst_end */
>> + add t1, a1, a2 /* running src_end */
>> +
>> +.Lbackward_loop:
>> + vsetvli t3, a2, e8, m8, ta, ma /* t3 = vl (bytes) */
>> + sub t0, t0, t3
>> + sub t1, t1, t3
>> + vle8.v v0, (t1)
>> + vse8.v v0, (t0)
>> + sub a2, a2, t3
>> + bnez a2, .Lbackward_loop
>> + j .Ldone_move
>
> `ret` rather than `j .Ldone_move` here, ret and j are both one
> instruction, so let just return to save one more jump :)
Absolutely. I'll make that change in the next revision.
Thank you very much for the detailed and thoughtful feedback. I really
appreciate your guidance. I'll post v2 of the patch shortly.
Best regards,
Pincheng Wang
Hi Pincheng,
is there a clear advantage for using assembly over using intrinsics?
Christian
Hi Christian,
I think the answer depends quite a bit on the current state of the
RISC-V toolchain.
For RVV intrinsics specifically, a practical consideration is compiler
availability and compatibility. RVV intrinsics based on the v0.11 spec
are supported by Clang 16 and GCC 13, while support for RVV v1.0
intrinsics only became available starting with Clang 19 and GCC 14. From
that perspective, using hand-written assembly avoids introducing a hard
dependency on newer compiler versions.
That said, intrinsics are clearly preferable from a maintainability and
readability standpoint, and are likely the better long-term direction
once RVV intrinsics are consistently available across toolchains.
Given these considerations, I'd be interested to hear the community's
thoughts on the preferred direction here. If there is interest, I would
be happy to also come up with an intrinsics-based implementation for comparison.
Best regards,
Pincheng Wang
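A minimal sketch of what such an intrinsics-based memcpy could look like,
assuming the prefixed v1.0-style intrinsics from <riscv_vector.h> (which, per
the above, need a recent compiler); memcpy_vec is a placeholder name, not part
of the patch:

#include <riscv_vector.h>
#include <stddef.h>
#include <stdint.h>

void *
memcpy_vec (void *dst, const void *src, size_t n)
{
  uint8_t *d = dst;
  const uint8_t *s = src;

  while (n > 0)
    {
      size_t vl = __riscv_vsetvl_e8m8 (n);         /* bytes handled this pass */
      vuint8m8_t v = __riscv_vle8_v_u8m8 (s, vl);  /* load a chunk */
      __riscv_vse8_v_u8m8 (d, v, vl);              /* store it */
      d += vl;
      s += vl;
      n -= vl;
    }

  return dst;
}

This mirrors the m8, vl-byte chunk loop of the assembly version while leaving
instruction scheduling and register allocation to the compiler.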
Patch
--- a/newlib/libc/machine/riscv/memcpy-asm.S
+++ b/newlib/libc/machine/riscv/memcpy-asm.S
@@ -9,11 +9,11 @@
http://www.opensource.org/licenses.
*/
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
.text
.global memcpy
.type memcpy, @function
memcpy:
+#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
mv a3, a0
beqz a2, 2f
@@ -29,4 +29,25 @@ memcpy:
ret
.size memcpy, .-memcpy
+#elif defined(__riscv_v)
+ .option push
+ .option arch, +v
+ mv t0, a0 /* running dst */
+ mv t1, a1 /* running src */
+ beqz a2, .Ldone_copy /* n == 0 then return */
+
+.Lbulk_copy:
+ vsetvli t2, a2, e8, m8, ta, ma /* t2 = vl (bytes) */
+ vle8.v v0, (t1)
+ vse8.v v0, (t0)
+ add t0, t0, t2
+ add t1, t1, t2
+ sub a2, a2, t2
+ bnez a2, .Lbulk_copy
+ /* fallthrough */
+
+.Ldone_copy:
+ ret
+.size memcpy, .-memcpy
+.option pop
#endif
--- a/newlib/libc/machine/riscv/memcpy.c
+++ b/newlib/libc/machine/riscv/memcpy.c
@@ -10,7 +10,7 @@
http://www.opensource.org/licenses.
*/
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
+#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) || defined(__riscv_v)
// memcpy defined in memcpy-asm.S
#else
--- a/newlib/libc/machine/riscv/memmove-asm.S
+++ b/newlib/libc/machine/riscv/memmove-asm.S
@@ -9,11 +9,11 @@
http://www.opensource.org/licenses.
*/
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
.text
.global memmove
.type memmove, @function
memmove:
+#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
beqz a2, .Ldone /* in case there are 0 bytes to be copied, return immediately */
mv a4, a0 /* copy the destination address over to a4, since memmove should return that address in a0 at the end */
@@ -37,4 +37,49 @@ memmove:
ret
.size memmove, .-memmove
+#elif defined(__riscv_v)
+ .option push
+ .option arch, +v
+ beqz a2, .Ldone_move /* n == 0 */
+ beq a0, a1, .Ldone_move /* dst == src */
+
+ /* overlap check */
+ bgeu a1, a0, .Lforward_move /* src >= dst then forward move*/
+
+ sub t2, a0, a1 /* t2 = dst - src */
+ bgeu t2, a2, .Lforward_move /* no overlap then forward move */
+
+ /* backward move */
+ add t0, a0, a2 /* running dst_end */
+ add t1, a1, a2 /* running src_end */
+
+.Lbackward_loop:
+ vsetvli t3, a2, e8, m8, ta, ma /* t3 = vl (bytes) */
+ sub t0, t0, t3
+ sub t1, t1, t3
+ vle8.v v0, (t1)
+ vse8.v v0, (t0)
+ sub a2, a2, t3
+ bnez a2, .Lbackward_loop
+ j .Ldone_move
+
+/* forward move, same as memcpy */
+.Lforward_move:
+ mv t0, a0 /* running dst */
+ mv t1, a1 /* running src */
+
+.Lforward_loop:
+ vsetvli t3, a2, e8, m8, ta, ma
+ vle8.v v0, (t1)
+ vse8.v v0, (t0)
+ add t0, t0, t3
+ add t1, t1, t3
+ sub a2, a2, t3
+ bnez a2, .Lforward_loop
+ /* fallthrough */
+
+.Ldone_move:
+ ret
+.size memmove, .-memmove
+.option pop
#endif
--- a/newlib/libc/machine/riscv/memmove.c
+++ b/newlib/libc/machine/riscv/memmove.c
@@ -10,7 +10,7 @@
http://www.opensource.org/licenses.
*/
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
+#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) || defined(__riscv_v)
/* memmove defined in memmove-asm.S */
#else
--- a/newlib/libc/machine/riscv/memset.S
+++ b/newlib/libc/machine/riscv/memset.S
@@ -63,6 +63,28 @@ memset:
.Ldone:
ret
+#elif defined(__riscv_v)
+ .option push
+ .option arch, +v
+ mv t0, a0 /* running dst; keep a0 as return */
+ beqz a2, .Ldone_vect /* n == 0 then return */
+
+ /* Broadcast fill byte once. */
+ vsetvli t1, zero, e8, m8, ta, ma
+ vmv.v.x v0, a1
+
+.Lbulk_vect:
+ vsetvli t1, a2, e8, m8, ta, ma /* t1 = vl (bytes) */
+ vse8.v v0, (t0)
+ add t0, t0, t1
+ sub a2, a2, t1
+ bnez a2, .Lbulk_vect
+ /* fallthrough */
+
+.Ldone_vect:
+ ret
+ .option pop
+
#else
li REG_TABLE, BYTE_TBL_SZ
mv a3, a0
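The memset hunk above broadcasts the fill byte once with vmv.v.x and then
stores vl-byte chunks. A rough intrinsics-level equivalent of that pattern
(a sketch only, not part of the patch; memset_vec is a made-up name):

#include <riscv_vector.h>
#include <stddef.h>
#include <stdint.h>

void *
memset_vec (void *dst, int c, size_t n)
{
  uint8_t *d = dst;

  /* Broadcast the fill byte once at VLMAX ...  */
  size_t vlmax = __riscv_vsetvlmax_e8m8 ();
  vuint8m8_t fill = __riscv_vmv_v_x_u8m8 ((uint8_t) c, vlmax);

  /* ... then store vl bytes per iteration until n is exhausted.  */
  while (n > 0)
    {
      size_t vl = __riscv_vsetvl_e8m8 (n);
      __riscv_vse8_v_u8m8 (d, fill, vl);
      d += vl;
      n -= vl;
    }

  return dst;
}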