[2/2] riscv: vectorised mem* and str* functions
Checks

Context               | Check   | Description
dj/TryBot-apply_patch | success | Patch applied to master at the time it was sent
dj/TryBot-32bit       | success | Build for i686
Commit Message
Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
targeting the riscv "V" extension, version 1.0
The vectorised implementations assume VLENB of at least 128 and at least 32
registers (as mandated by the "V" extension spec). They also assume that
VLENB is a power of two which is no larger than the page size, and (as
vectorised code in glibc for other platforms does) that it is safe to read
past null terminators / buffer ends provided one does not cross a page
boundary.
Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
---
sysdeps/riscv/rv64/rvv/Implies | 2 +
sysdeps/riscv/rv64/rvv/memchr.S | 127 +++++++++++++++++++
sysdeps/riscv/rv64/rvv/memcmp.S | 93 ++++++++++++++
sysdeps/riscv/rv64/rvv/memcpy.S | 154 +++++++++++++++++++++++
sysdeps/riscv/rv64/rvv/memmove.c | 22 ++++
sysdeps/riscv/rv64/rvv/memset.S | 89 ++++++++++++++
sysdeps/riscv/rv64/rvv/strchr.S | 92 ++++++++++++++
sysdeps/riscv/rv64/rvv/strchrnul.c | 22 ++++
sysdeps/riscv/rv64/rvv/strcmp.S | 108 +++++++++++++++++
sysdeps/riscv/rv64/rvv/strcpy.S | 72 +++++++++++
sysdeps/riscv/rv64/rvv/strcspn.c | 22 ++++
sysdeps/riscv/rv64/rvv/strlen.S | 67 ++++++++++
sysdeps/riscv/rv64/rvv/strncmp.S | 104 ++++++++++++++++
sysdeps/riscv/rv64/rvv/strncpy.S | 96 +++++++++++++++
sysdeps/riscv/rv64/rvv/strnlen.S | 81 +++++++++++++
sysdeps/riscv/rv64/rvv/strrchr.S | 88 ++++++++++++++
sysdeps/riscv/rv64/rvv/strspn.S | 189 +++++++++++++++++++++++++++++
17 files changed, 1428 insertions(+)
create mode 100644 sysdeps/riscv/rv64/rvv/Implies
create mode 100644 sysdeps/riscv/rv64/rvv/memchr.S
create mode 100644 sysdeps/riscv/rv64/rvv/memcmp.S
create mode 100644 sysdeps/riscv/rv64/rvv/memcpy.S
create mode 100644 sysdeps/riscv/rv64/rvv/memmove.c
create mode 100644 sysdeps/riscv/rv64/rvv/memset.S
create mode 100644 sysdeps/riscv/rv64/rvv/strchr.S
create mode 100644 sysdeps/riscv/rv64/rvv/strchrnul.c
create mode 100644 sysdeps/riscv/rv64/rvv/strcmp.S
create mode 100644 sysdeps/riscv/rv64/rvv/strcpy.S
create mode 100644 sysdeps/riscv/rv64/rvv/strcspn.c
create mode 100644 sysdeps/riscv/rv64/rvv/strlen.S
create mode 100644 sysdeps/riscv/rv64/rvv/strncmp.S
create mode 100644 sysdeps/riscv/rv64/rvv/strncpy.S
create mode 100644 sysdeps/riscv/rv64/rvv/strnlen.S
create mode 100644 sysdeps/riscv/rv64/rvv/strrchr.S
create mode 100644 sysdeps/riscv/rv64/rvv/strspn.S
Comments
On 2/1/23 02:52, Sergei Lewis wrote:
> Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
> strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
> targeting the riscv "V" extension, version 1.0
>
> The vectorised implementations assume VLENB of at least 128 and at least 32
> registers (as mandated by the "V" extension spec). They also assume that
> VLENB is a power of two which is no larger than the page size, and (as
> vectorised code in glibc for other platforms does) that it is safe to read
> past null terminators / buffer ends provided one does not cross a page
> boundary.
>
> Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
> ---
> sysdeps/riscv/rv64/rvv/Implies | 2 +
> sysdeps/riscv/rv64/rvv/memchr.S | 127 +++++++++++++++++++
> sysdeps/riscv/rv64/rvv/memcmp.S | 93 ++++++++++++++
> sysdeps/riscv/rv64/rvv/memcpy.S | 154 +++++++++++++++++++++++
> sysdeps/riscv/rv64/rvv/memmove.c | 22 ++++
> sysdeps/riscv/rv64/rvv/memset.S | 89 ++++++++++++++
> sysdeps/riscv/rv64/rvv/strchr.S | 92 ++++++++++++++
> sysdeps/riscv/rv64/rvv/strchrnul.c | 22 ++++
> sysdeps/riscv/rv64/rvv/strcmp.S | 108 +++++++++++++++++
> sysdeps/riscv/rv64/rvv/strcpy.S | 72 +++++++++++
> sysdeps/riscv/rv64/rvv/strcspn.c | 22 ++++
> sysdeps/riscv/rv64/rvv/strlen.S | 67 ++++++++++
> sysdeps/riscv/rv64/rvv/strncmp.S | 104 ++++++++++++++++
> sysdeps/riscv/rv64/rvv/strncpy.S | 96 +++++++++++++++
> sysdeps/riscv/rv64/rvv/strnlen.S | 81 +++++++++++++
> sysdeps/riscv/rv64/rvv/strrchr.S | 88 ++++++++++++++
> sysdeps/riscv/rv64/rvv/strspn.S | 189 +++++++++++++++++++++++++++++
Does this need to be revamped given the recent push to do more with
generic code and target specific hooks for mem* and str*?
Shouldn't the implementations be in a multiarch directory? I would
fully expect we're going to need both a vector and scalar implementation
selected by an ifunc.
I'm happy to pass along the current bits from VRULL which put that
infrastructure in place. I just haven't had the time to look at
revamping their assembly implementations for the new generic+hooks scheme.
jeff
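For reference, a minimal sketch of what ifunc-based selection could look like in plain C, assuming a hypothetical runtime capability probe (hart_has_rvv) and separately built scalar and vector variants; the names are illustrative and this is not glibc's actual multiarch machinery:

#include <stddef.h>

extern void *memcpy_scalar (void *, const void *, size_t);
extern void *memcpy_rvv (void *, const void *, size_t);
extern int hart_has_rvv (void);   /* hypothetical capability probe */

typedef void *(*memcpy_fn) (void *, const void *, size_t);

/* The resolver runs at load time and picks one implementation.  */
static memcpy_fn
resolve_memcpy (void)
{
  return hart_has_rvv () ? memcpy_rvv : memcpy_scalar;
}

/* In glibc itself this would be wired up to the exported memcpy symbol.  */
void *memcpy_selected (void *, const void *, size_t)
  __attribute__ ((ifunc ("resolve_memcpy")));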
* Jeff Law via Libc-alpha:
> On 2/1/23 02:52, Sergei Lewis wrote:
>> Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
>> strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
>> targeting the riscv "V" extension, version 1.0
>> The vectorised implementations assume VLENB of at least 128 and at
>> least 32
>> registers (as mandated by the "V" extension spec). They also assume that
>> VLENB is a power of two which is no larger than the page size, and (as
>> vectorised code in glibc for other platforms does) that it is safe to read
>> past null terminators / buffer ends provided one does not cross a page
>> boundary.
>> Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
>> ---
>> sysdeps/riscv/rv64/rvv/Implies | 2 +
>> sysdeps/riscv/rv64/rvv/memchr.S | 127 +++++++++++++++++++
>> sysdeps/riscv/rv64/rvv/memcmp.S | 93 ++++++++++++++
>> sysdeps/riscv/rv64/rvv/memcpy.S | 154 +++++++++++++++++++++++
>> sysdeps/riscv/rv64/rvv/memmove.c | 22 ++++
>> sysdeps/riscv/rv64/rvv/memset.S | 89 ++++++++++++++
>> sysdeps/riscv/rv64/rvv/strchr.S | 92 ++++++++++++++
>> sysdeps/riscv/rv64/rvv/strchrnul.c | 22 ++++
>> sysdeps/riscv/rv64/rvv/strcmp.S | 108 +++++++++++++++++
>> sysdeps/riscv/rv64/rvv/strcpy.S | 72 +++++++++++
>> sysdeps/riscv/rv64/rvv/strcspn.c | 22 ++++
>> sysdeps/riscv/rv64/rvv/strlen.S | 67 ++++++++++
>> sysdeps/riscv/rv64/rvv/strncmp.S | 104 ++++++++++++++++
>> sysdeps/riscv/rv64/rvv/strncpy.S | 96 +++++++++++++++
>> sysdeps/riscv/rv64/rvv/strnlen.S | 81 +++++++++++++
>> sysdeps/riscv/rv64/rvv/strrchr.S | 88 ++++++++++++++
>> sysdeps/riscv/rv64/rvv/strspn.S | 189 +++++++++++++++++++++++++++++
> Does this need to be revamped given the recent push to do more with
> generic code and target specific hooks for mem* and str*?
>
> Shouldn't the implementations be in a multiarch directory? I would
> fully expect we're going to need both a vector and scalar
> implementation selected by an ifunc.
I think most RISC-V GCC compilers won't have IFUNC support enabled?
Looking at gcc/config.gcc in GCC 12, I see this:
*-*-linux* | *-*-gnu*)
case ${target} in
aarch64*-* | arm*-* | i[34567]86-* | powerpc*-* | s390*-* | sparc*-* | x86_64-* | loongarch*-*)
default_gnu_indirect_function=yes
;;
esac
But maybe that's not the right place to look at?
We have an assembler hack to be able to still build IFUNC resolvers
written in C, but I don't know if this works on RISC-V.
Ideally the GCC defaults would change, too, and well before IFUNCs are
in common use.
Thanks,
Florian
On 2/1/23 09:42, Florian Weimer wrote:
> * Jeff Law via Libc-alpha:
>
>> On 2/1/23 02:52, Sergei Lewis wrote:
>>> Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
>>> strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
>>> targeting the riscv "V" extension, version 1.0
>>> The vectorised implementations assume VLENB of at least 128 and at
>>> least 32
>>> registers (as mandated by the "V" extension spec). They also assume that
>>> VLENB is a power of two which is no larger than the page size, and (as
>>> vectorised code in glibc for other platforms does) that it is safe to read
>>> past null terminators / buffer ends provided one does not cross a page
>>> boundary.
>>> Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
>>> ---
>>> sysdeps/riscv/rv64/rvv/Implies | 2 +
>>> sysdeps/riscv/rv64/rvv/memchr.S | 127 +++++++++++++++++++
>>> sysdeps/riscv/rv64/rvv/memcmp.S | 93 ++++++++++++++
>>> sysdeps/riscv/rv64/rvv/memcpy.S | 154 +++++++++++++++++++++++
>>> sysdeps/riscv/rv64/rvv/memmove.c | 22 ++++
>>> sysdeps/riscv/rv64/rvv/memset.S | 89 ++++++++++++++
>>> sysdeps/riscv/rv64/rvv/strchr.S | 92 ++++++++++++++
>>> sysdeps/riscv/rv64/rvv/strchrnul.c | 22 ++++
>>> sysdeps/riscv/rv64/rvv/strcmp.S | 108 +++++++++++++++++
>>> sysdeps/riscv/rv64/rvv/strcpy.S | 72 +++++++++++
>>> sysdeps/riscv/rv64/rvv/strcspn.c | 22 ++++
>>> sysdeps/riscv/rv64/rvv/strlen.S | 67 ++++++++++
>>> sysdeps/riscv/rv64/rvv/strncmp.S | 104 ++++++++++++++++
>>> sysdeps/riscv/rv64/rvv/strncpy.S | 96 +++++++++++++++
>>> sysdeps/riscv/rv64/rvv/strnlen.S | 81 +++++++++++++
>>> sysdeps/riscv/rv64/rvv/strrchr.S | 88 ++++++++++++++
>>> sysdeps/riscv/rv64/rvv/strspn.S | 189 +++++++++++++++++++++++++++++
>> Does this need to be revamped given the recent push to do more with
>> generic code and target specific hooks for mem* and str*?
>>
>> Shouldn't the implementations be in a multiarch directory? I would
>> fully expect we're going to need both a vector and scalar
>> implementation selected by an ifunc.
>
> I think most RISC-V GCC compilers won't have enabled IFUNC support?
> Looking at gcc/config.gcc in GCC 12, I see this:
>
> *-*-linux* | *-*-gnu*)
> case ${target} in
> aarch64*-* | arm*-* | i[34567]86-* | powerpc*-* | s390*-* | sparc*-* | x86_64-* | loongarch*-*)
> default_gnu_indirect_function=yes
> ;;
> esac
>
> But maybe that's not the right place to look at?
Clearly something we need to fix.
I'd hesitate to turn on the gcc bits without having the kernel/user
interface settled. There was a proposal that added a syscall to get the
processor capabilities -- I'd asked the authors to reach out to you and
Carlos on whether or not that was acceptable for glibc. I'm not sure if
that happened or not.
>
> We have an assembler hack to be able to still build IFUNC resolvers
> written in C, but I don't know if this works on RISC-V.
It probably doesn't yet.
>
> Ideally the GCC defaults would change, too, and well before IFUNCs are
> in common use.
They're not common, but I suspect that'll change in the next ~6 months.
Jeff
On 01/02/23 12:33, Jeff Law via Libc-alpha wrote:
>
>
> On 2/1/23 02:52, Sergei Lewis wrote:
>> Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
>> strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
>> targeting the riscv "V" extension, version 1.0
>>
>> The vectorised implementations assume VLENB of at least 128 and at least 32
>> registers (as mandated by the "V" extension spec). They also assume that
>> VLENB is a power of two which is no larger than the page size, and (as
>> vectorised code in glibc for other platforms does) that it is safe to read
>> past null terminators / buffer ends provided one does not cross a page
>> boundary.
>>
>> Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
>> ---
>> sysdeps/riscv/rv64/rvv/Implies | 2 +
>> sysdeps/riscv/rv64/rvv/memchr.S | 127 +++++++++++++++++++
>> sysdeps/riscv/rv64/rvv/memcmp.S | 93 ++++++++++++++
>> sysdeps/riscv/rv64/rvv/memcpy.S | 154 +++++++++++++++++++++++
>> sysdeps/riscv/rv64/rvv/memmove.c | 22 ++++
>> sysdeps/riscv/rv64/rvv/memset.S | 89 ++++++++++++++
>> sysdeps/riscv/rv64/rvv/strchr.S | 92 ++++++++++++++
>> sysdeps/riscv/rv64/rvv/strchrnul.c | 22 ++++
>> sysdeps/riscv/rv64/rvv/strcmp.S | 108 +++++++++++++++++
>> sysdeps/riscv/rv64/rvv/strcpy.S | 72 +++++++++++
>> sysdeps/riscv/rv64/rvv/strcspn.c | 22 ++++
>> sysdeps/riscv/rv64/rvv/strlen.S | 67 ++++++++++
>> sysdeps/riscv/rv64/rvv/strncmp.S | 104 ++++++++++++++++
>> sysdeps/riscv/rv64/rvv/strncpy.S | 96 +++++++++++++++
>> sysdeps/riscv/rv64/rvv/strnlen.S | 81 +++++++++++++
>> sysdeps/riscv/rv64/rvv/strrchr.S | 88 ++++++++++++++
>> sysdeps/riscv/rv64/rvv/strspn.S | 189 +++++++++++++++++++++++++++++
> Does this need to be revamped given the recent push to do more with generic code and target specific hooks for mem* and str*?
>
It should be doable, although I think it might require some more hooks, since
afaiu the RISCV vector instructions do not have the concept of address update
in their vector approach (as my generic string routines do by using the
'vector' type op_t). So I am not sure it is a good fit for the RISCV vector
approach.
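(For context, the word-at-a-time idea referred to here, which the quoted memchr also uses in its scalar path, boils down to something like the sketch below; op_t just stands for a native machine word and these are not glibc's exact definitions.)

#include <stdint.h>

typedef uint64_t op_t;   /* one native word per iteration */

/* Nonzero iff some byte of X is zero: the classic
   (x - 0x01..01) & ~x & 0x80..80 test.  */
static inline op_t
has_zero (op_t x)
{
  return (x - (op_t) 0x0101010101010101ULL)
         & ~x
         & (op_t) 0x8080808080808080ULL;
}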
> Shouldn't the implementations be in a multiarch directory? I would fully expect we're going to need both a vector and scalar implementation selected by an ifunc.
If so, we will need both IFUNC support on riscv (which, as Florian pointed out,
is not enabled by default) and a way to discover the processor/kernel
capabilities at runtime. It does not seem that RISCV has the expected hwcap
support; the UAPI only defines COMPAT_HWCAP_ISA_*, which also does not seem to
cover the vector or bitmanip extensions. Does it have an instruction to query
for such information, something like cpuid (x86) or midr_elX (aarch64)?
It would be interesting to add an ifunc variant for bitmanip as well.
>
> I'm happy to pass along the current bits from VRULL which put that infrastructure in place. I just haven't had the time to look at revamping their assembly implementations for the new generic+hooks scheme.
I just sent an updated version [1] where I added bitmanip optimizations [2].
So now, with a recent gcc (I tested with gcc 13 and upstream qemu), the
string routines should use ctz/clz/orc.b. I did not add support for
xthread [3], but it should be doable.
Once I get this upstream, I have a WIP to revamp memmove/memcpy/memset/memcmp
as well; at least memcmp should also use bitmanip.
[1] https://patchwork.sourceware.org/project/glibc/list/?series=16622
[2] https://patchwork.sourceware.org/project/glibc/patch/20230201170406.303978-21-adhemerval.zanella@linaro.org/
[3] https://lore.kernel.org/all/20220906122243.1243354-1-christoph.muellner@vrull.eu/
On 01/02/23 06:52, Sergei Lewis wrote:
> Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
> strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
> targeting the riscv "V" extension, version 1.0
>
> The vectorised implementations assume VLENB of at least 128 and at least 32
> registers (as mandated by the "V" extension spec). They also assume that
> VLENB is a power of two which is no larger than the page size, and (as
> vectorised code in glibc for other platforms does) that it is safe to read
> past null terminators / buffer ends provided one does not cross a page
> boundary.
>
> Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
Some comments below that might be useful, since I am working on the generic
implementations.
Also, I think this should be split into one implementation per patch, unless the
implementations are tied together (as strchr/strchrnul are, for instance). Do
the vectorized routines only work for rv64?
> ---
> sysdeps/riscv/rv64/rvv/Implies | 2 +
> sysdeps/riscv/rv64/rvv/memchr.S | 127 +++++++++++++++++++
> sysdeps/riscv/rv64/rvv/memcmp.S | 93 ++++++++++++++
> sysdeps/riscv/rv64/rvv/memcpy.S | 154 +++++++++++++++++++++++
> sysdeps/riscv/rv64/rvv/memmove.c | 22 ++++
> sysdeps/riscv/rv64/rvv/memset.S | 89 ++++++++++++++
> sysdeps/riscv/rv64/rvv/strchr.S | 92 ++++++++++++++
> sysdeps/riscv/rv64/rvv/strchrnul.c | 22 ++++
> sysdeps/riscv/rv64/rvv/strcmp.S | 108 +++++++++++++++++
> sysdeps/riscv/rv64/rvv/strcpy.S | 72 +++++++++++
> sysdeps/riscv/rv64/rvv/strcspn.c | 22 ++++
> sysdeps/riscv/rv64/rvv/strlen.S | 67 ++++++++++
> sysdeps/riscv/rv64/rvv/strncmp.S | 104 ++++++++++++++++
> sysdeps/riscv/rv64/rvv/strncpy.S | 96 +++++++++++++++
> sysdeps/riscv/rv64/rvv/strnlen.S | 81 +++++++++++++
> sysdeps/riscv/rv64/rvv/strrchr.S | 88 ++++++++++++++
> sysdeps/riscv/rv64/rvv/strspn.S | 189 +++++++++++++++++++++++++++++
> 17 files changed, 1428 insertions(+)
> create mode 100644 sysdeps/riscv/rv64/rvv/Implies
> create mode 100644 sysdeps/riscv/rv64/rvv/memchr.S
> create mode 100644 sysdeps/riscv/rv64/rvv/memcmp.S
> create mode 100644 sysdeps/riscv/rv64/rvv/memcpy.S
> create mode 100644 sysdeps/riscv/rv64/rvv/memmove.c
> create mode 100644 sysdeps/riscv/rv64/rvv/memset.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strchr.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strchrnul.c
> create mode 100644 sysdeps/riscv/rv64/rvv/strcmp.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strcpy.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strcspn.c
> create mode 100644 sysdeps/riscv/rv64/rvv/strlen.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strncmp.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strncpy.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strnlen.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strrchr.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strspn.S
>
> diff --git a/sysdeps/riscv/rv64/rvv/Implies b/sysdeps/riscv/rv64/rvv/Implies
> new file mode 100644
> index 0000000000..b07b4cb906
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/Implies
> @@ -0,0 +1,2 @@
> +riscv/rv64/rvd
> +
> diff --git a/sysdeps/riscv/rv64/rvv/memchr.S b/sysdeps/riscv/rv64/rvv/memchr.S
> new file mode 100644
> index 0000000000..a7e32b8f25
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memchr.S
> @@ -0,0 +1,127 @@
> +
Spurious newline at the start. We also require a brief comment describing
the file contents for new files.
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
Not sure the 2012 copyright range fits here.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +
> +/* Optimised memchr for riscv with vector extension
> + * Assumptions:
> + * - cpu becomes bandwidth limited at or before
> + * 2 vector register sized read/write operations
> + * + 2 scalar operations
> + * + conditional branch
> + */
> +
> +.globl memchr
> +.type memchr,@function
> +
> +.align 2
> +memchr:
We have the ENTRY macro for that.
> + beqz a2, .Lnot_found
Maybe use the L macro here for local labels.
> + csrr t1, vlenb
> + bgeu a2, t1, .Lvector_path /* only use vector path if we're scanning
> + at least vlenb bytes */
> +
> +#ifndef __riscv_strict_align
Would this be defined by the compiler as a predefined macro, or is it just a debug
switch? If the latter, I think it would be better to remove it.
> + li a3, 8
> + blt a2, a3, .Lbytewise
> +
> + li t1, 0x0101010101010101
> + slli a4, t1, 7 /* a4 = 0x8080808080808080 */
> + mul t2, a1, t1 /* entirety of t2 is now repeats of target character;
> + assume mul is at worst no worse than 3*(shift+OR),
> + otherwise do that instead */
> +
> +/*
> + * strategy:
> + * t4 = ((*a0) ^ t2)
> + * - now t4 contains zero bytes if and only if next word of memory
> + * had target character at those positions
> + *
> + * t4 = ((t4-0x0101010101010101) & ~t4) & 0x8080808080808080
> + * - all nonzero bytes of t4 become 0; zero bytes become 0x80
> + *
> + * if t4 is nonzero, find the index of the byte within it, add to a0 and return
> + * otherwise, loop
> + */
> +
> +1:
> + ld t4, (a0) /* t4 = load next 8 bytes */
> + xor t4, t4, t2
> + sub t5, t4, t1
> + not t4, t4
> + and t4, t5, t4
> + and t4, t4, a4
> + bnez t4, .Lbytewise /* could use ctzw, mod+lookup or just binary chop
> + to locate byte of interest in t4 but profiling
> + shows these approaches are at best no better */
> + addi a2, a2, -8
> + addi a0, a0, 8
> + bgeu a2, a3, 1b
> + beqz a2, .Lnot_found
> +#endif // __riscv_strict_align
> +
> +/* too little data for a dword. mask calculation and branch mispredict costs
> + make checking a word not worthwhile. degrade to bytewise search. */
> +
> +.Lbytewise:
> + add t2, a0, a2
> +
> +1:
> + lb t1, (a0)
> + beq t1, a1, .Lfound
> + addi a0, a0, 1
> + blt a0, t2, 1b
> +
> +.Lnot_found:
> + mv a0, zero
> +.Lfound:
> + ret
> +
> +.Lvector_path:
> + vsetvli t2, a2, e8, m2, ta, ma
> +
> +1:
> + vle8.v v2, (a0)
> + vmseq.vx v0, v2, a1
> + vfirst.m t3, v0
> + bgez t3, .Lvec_found
> + add a0, a0, t2
> + sub a2, a2, t2
> + bge a2, t2, 1b
> + bnez a2, 2f
> + mv a0, zero
> + ret
> +
> +2:
> + vsetvli t2, a2, e8, m2, ta, ma
> + vle8.v v2, (a0)
> + vmseq.vx v0, v2, a1
> + vfirst.m t3, v0
> + bgez t3, .Lvec_found
> + mv a0, zero
> + ret
> +
> +.Lvec_found:
> + add a0, a0, t3
> + ret
> +
> +.size memchr, .-memchr
> +libc_hidden_builtin_def (memchr)
> \ No newline at end of file
Please add a newline.
> diff --git a/sysdeps/riscv/rv64/rvv/strcpy.S b/sysdeps/riscv/rv64/rvv/strcpy.S
> new file mode 100644
> index 0000000000..b21909d66f
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcpy.S
> @@ -0,0 +1,72 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
You can add an optimized stpcpy and use it to implement strcpy on top of that
(as my generic proposal does [1]). ARMv6 does something similar [2].
[1] https://patchwork.sourceware.org/project/glibc/patch/20230201170406.303978-12-adhemerval.zanella@linaro.org/
[2] https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/arm/armv6/strcpy.S;h=e9f63a56c1c605a21b05f7ac21412585b0705171;hb=HEAD
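In other words, assuming an optimized stpcpy is available, strcpy reduces to roughly the following sketch:

#define _GNU_SOURCE   /* for the stpcpy declaration */
#include <string.h>

char *
strcpy (char *dest, const char *src)
{
  stpcpy (dest, src);   /* stpcpy returns a pointer to the copied NUL */
  return dest;
}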
> +#include <sysdep.h>
> +
> +.globl strcpy
> +.type strcpy,@function
> +
> +/*
> + * optimized strcpy for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 2*vlenb
> + */
> +
> +.align 2
> +strcpy:
> + mv t0, a0 /* copy dest so we can return it */
> +
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> +
> + addi t2, t1, -1 /* mask unaligned part of ptr */
> + and t2, a1, t2
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search enough to align ptr */
> + vsetvli t2, t2, e8, m2, tu, mu
> + vle8.v v2, (a1)
> + vmseq.vx v4, v2, zero
> + vmsif.m v0, v4 /* copy but not past null */
> + vfirst.m t3, v4
> + vse8.v v2, (t0), v0.t
> + bgez t3, .Ldone
> + add t0, t0, t2
> + add a1, a1, t2
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> +
> +1:
> + vle8.v v2, (a1)
> + add a1, a1, t1
> + vmseq.vx v4, v2, zero
> + vmsif.m v0, v4
> + vfirst.m t3, v4
> + vse8.v v2, (t0), v0.t
> + add t0, t0, t1
> + bltz t3, 1b
> +
> +.Ldone:
> + ret
> +
> +.size strcpy, .-strcpy
> +libc_hidden_builtin_def (strcpy)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strcspn.c b/sysdeps/riscv/rv64/rvv/strcspn.c
> new file mode 100644
> index 0000000000..f0595a72fb
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcspn.c
> @@ -0,0 +1,22 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +/* strcspn is implemented in strspn.S
> + */
> diff --git a/sysdeps/riscv/rv64/rvv/strlen.S b/sysdeps/riscv/rv64/rvv/strlen.S
> new file mode 100644
> index 0000000000..c77d500693
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strlen.S
> @@ -0,0 +1,67 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strlen
> +.type strlen,@function
> +
> +/*
> + * optimized strlen for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 2*vlenb
> + */
> +
> +.align 2
> +strlen:
> + mv t4, a0 /* copy of buffer start */
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> + addi t2, t1, -1 /* mask off unaligned part of ptr */
> + and t2, a0, t2
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search fwd to align ptr */
> + vsetvli t2, t2, e8, m2, ta, ma
> + vle8.v v2, (a0)
> + vmseq.vx v0, v2, zero
> + vfirst.m t3, v0
> + bgez t3, .Lfound
> + add a0, a0, t2
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, ma /* search 2*vlenb bytes per pass */
> + add t4, t4, t1
> +
> +1:
> + vle8.v v2, (a0)
> + add a0, a0, t1
> + vmseq.vx v0, v2, zero
> + vfirst.m t3, v0
> + bltz t3, 1b
> +
> +.Lfound: /* found the 0; subtract */
> + sub a0, a0, t4 /* buffer start from current ptr */
> + add a0, a0, t3 /* and add offset into fetched */
> + ret /* data to get length */
> +
> +.size strlen, .-strlen
> +libc_hidden_builtin_def (strlen)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strncmp.S b/sysdeps/riscv/rv64/rvv/strncmp.S
> new file mode 100644
> index 0000000000..863e5cb525
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strncmp.S
> @@ -0,0 +1,104 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strncmp
> +.type strncmp,@function
> +
> +.align 2
> +
> +/* as strcmp, but with added checks on a2 (max count)
> + */
> +
> +strncmp:
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> + blt a2, t1, .Ltail /* degrade if max < vlenb*2 */
> + vsetvli zero, t1, e8, m2, ta, mu
> + vid.v v30
> + addi t2, t1, -1 /* mask unaligned part of ptr */
> + and t6, a0, t2 /* unaligned part of lhs */
> + and t5, a1, t2 /* unaligned part of rhs */
> + sub t6, t1, t6 /* safe count to read from lhs */
> + sub t5, t1, t5 /* same, rhs */
> + vmsltu.vx v28, v30, t6 /* mask for first part of lhs */
> + vmsltu.vx v26, v30, t5 /* mask for first part of rhs */
> + vmv.v.x v16, zero
> + vmv.v.x v18, zero
> +
> +
> +1: blt a2, t1, .Ltail
> + vmv.v.v v0, v28 /* lhs mask */
> + vle8.v v2, (a0), v0.t /* masked load from lhs */
> + vmseq.vx v16, v2, zero, v0.t /* check loaded bytes for null */
> + vmv.v.v v0, v26 /* rhs mask */
> + vfirst.m t2, v16 /* get lhs check result */
> + bgez t2, .Ltail /* can we safely check rest */
> + vle8.v v4, (a1), v0.t /* masked load from rhs */
> + vmseq.vx v18, v4, zero, v0.t /* check partial rhs */
> + vmnot.m v0, v28 /* mask for rest of lhs */
> + vfirst.m t3, v18 /* get check result */
> + bltz t3, 2f /* test it */
> + bge t3, t6, .Ltail
> +
> + vmsleu.vx v0, v30, t3 /* select rest of string + null */
> + vmsne.vv v0, v2, v4, v0.t /* compare */
> + vfirst.m t3, v0
> + bgez t3, 3f
> + mv a0, zero
> + ret
> +3: add a0, a0, t3
> + add a1, a1, t3
> + lbu t0, (a0)
> + lbu t1, (a1)
> +.Ldiff:
> + sub a0, t0, t1
> + ret
> +
> + /* ...no null terminator in first part of lhs or rhs */
> +2: vle8.v v2, (a0), v0.t /* load rest of lhs */
> + vmnot.m v0, v26 /* mask for rest of rhs */
> + vle8.v v4, (a1), v0.t /* load rest of rhs */
> + vmsne.vv v0, v2, v4 /* compare */
> + add a0, a0, t1 /* advance ptrs */
> + vfirst.m t3, v0
> + add a1, a1, t1
> + sub a2, a2, t1
> + bltz t3, 1b
> +
> + sub t3, t3, t1 /* found a diff but we've already advanced a0 and a1 */
> + j 3b
> +
> +.Ltail:
> + beqz a2, 1f
> + addi a2, a2, -1
> + lbu t0, (a0)
> + lbu t1, (a1)
> + bne t0, t1, .Ldiff
> + addi a0, a0, 1
> + addi a1, a1, 1
> + bnez t0, .Ltail
> +1: mv a0, zero
> + ret
> +
> +
> +.size strncmp, .-strncmp
> +libc_hidden_builtin_def (strncmp)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strncpy.S b/sysdeps/riscv/rv64/rvv/strncpy.S
> new file mode 100644
> index 0000000000..8b3a1e545c
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strncpy.S
> @@ -0,0 +1,96 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strncpy
> +.type strncpy,@function
> +
> +/*
> + * optimized strcpy for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 2*vlenb
> + */
> +
> +.align 2
> +strncpy:
> + mv t0, a0 /* need to return dest so copy */
> +
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> +
> + addi t2, t1, -1 /* mask off unaligned part of ptr */
> + and t2, a1, t2
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search to align the pointer */
> + vsetvli zero, t2, e8, m2, tu, mu
> + vle8.v v2, (a1)
> + vmseq.vx v4, v2, zero
> + vmsif.m v0, v4 /* copy to dest */
> + vfirst.m t3, v4
> + bgeu t2, a2, .Ldest_full
> + vse8.v v2, (t0), v0.t
> + bgez t3, .Lterminator_found
> + add t0, t0, t2
> + add a1, a1, t2
> + sub a2, a2, t2
> + beqz a2, .Ldone
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> +
> +1:
> + vle8.v v2, (a1)
> + add a1, a1, t1
> + vmseq.vx v4, v2, zero
> + vmsif.m v0, v4
> + vfirst.m t3, v4
> + bgeu t1, a2, .Ldest_full
> + vse8.v v2, (t0), v0.t
> + add t0, t0, t1
> + sub a2, a2, t1
> + bltz t3, 1b
> + sub t0, t0, t1
> +
> +.Lterminator_found:
> + addi sp, sp, -16
> + sd ra, 0(sp)
> + sd a0, 8(sp)
> + add a0, t0, t3
> + mv a1, zero
> + sub a2, a2, t3
> + jal ra, memset
> + ld ra, 0(sp)
> + ld a0, 8(sp)
> + addi sp, sp, 16
> +.Ldone:
> + ret
> +
> +.Ldest_full:
> + vid.v v6
> + vmsltu.vx v4, v6, a2
> + vmand.mm v0, v0, v4
> + vse8.v v2, (t0), v0.t
> + ret
> +
> +.size strncpy, .-strncpy
> +libc_hidden_builtin_def (strncpy)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strnlen.S b/sysdeps/riscv/rv64/rvv/strnlen.S
> new file mode 100644
> index 0000000000..6d7ee65c7a
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strnlen.S
> @@ -0,0 +1,81 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
Maybe use a generic implementation that calls memchr (which should be optimized
using vector instructions) [3]? It would be an extra function call, but it should
really help with both code size and icache pressure.
[3] https://patchwork.sourceware.org/project/glibc/patch/20230201170406.303978-6-adhemerval.zanella@linaro.org/
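That is, roughly (a sketch of the memchr-based approach, not the literal generic code):

#include <string.h>

size_t
strnlen (const char *s, size_t maxlen)
{
  const char *p = memchr (s, '\0', maxlen);
  return p != NULL ? (size_t) (p - s) : maxlen;
}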
> +.globl __strnlen
> +.type __strnlen,@function
> +
> +/* vector optimized strnlen
> + * assume it's safe to read to the end of the page
> + * containing either a null terminator or the last byte of the count or both,
> + * but not past it
> + * assume page size >= vlenb*2
> + */
> +
> +.align 2
> +__strnlen:
> + mv t4, a0 /* stash a copy of start for later */
> + beqz a1, .LzeroCount
> +
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> + addi t2, t1, -1 /* mask off unaligned part of ptr */
> + and t2, a1, a0
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search to align pointer to t1 */
> + bgeu t2, a1, 2f /* check it's safe */
> + mv t2, a1 /* it's not! look as far as permitted */
> +2: vsetvli t2, t2, e8, m2, ta, ma
> + vle8.v v2, (a0)
> + vmseq.vx v0, v2, zero
> + vfirst.m t3, v0
> + bgez t3, .Lfound
> + add a0, a0, t2
> + sub a1, a1, t2
> + bltu a1, t1, .LreachedCount
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, ma /* do 2*vlenb bytes per pass */
> +
> +1: vle8.v v2, (a0)
> + sub a1, a1, t1
> + vmseq.vx v0, v2, zero
> + vfirst.m t3, v0
> + bgez t3, .Lfound
> + add a0, a0, t1
> + bgeu a1, t1, 1b
> +.LreachedCount:
> + mv t2, a1 /* in case 0 < a1 < t1 */
> + bnez a1, 2b /* if so, still t2 bytes to check, all safe */
> +.LzeroCount:
> + sub a0, a0, t4
> + ret
> +
> +.Lfound: /* found the 0; subtract buffer start from current pointer */
> + add a0, a0, t3 /* and add offset into fetched data */
> + sub a0, a0, t4
> + ret
> +
> +.size __strnlen, .-__strnlen
> +weak_alias (__strnlen, strnlen)
> +libc_hidden_builtin_def (__strnlen)
> +libc_hidden_builtin_def (strnlen)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strrchr.S b/sysdeps/riscv/rv64/rvv/strrchr.S
> new file mode 100644
> index 0000000000..4bef8a3b9c
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strrchr.S
> @@ -0,0 +1,88 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
Is it really worth adding a strrchr optimization? The generic implementation
already calls strchr (which should be optimized).
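The strchr-based idea is roughly the following (an illustrative sketch, not the exact glibc code): keep searching for C and remember the last hit.

#include <string.h>

char *
strrchr (const char *s, int c)
{
  const char *found = NULL;

  while ((s = strchr (s, c)) != NULL)
    {
      found = s;
      if (c == '\0')
        break;
      s++;
    }
  return (char *) found;
}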
> +
> +.globl strrchr
> +.type strrchr,@function
> +
> +/*
> + * optimized strrchr for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 2*vlenb
> + */
> +
> +.align 2
> +strrchr:
> + mv t5, a0 /* stash buffer ptr somewhere safe */
> + mv a0, zero /* result is nullptr unless we find better below */
> +
> + csrr t1, vlenb /* determine vlenb*2 */
> + add t1, t1, t1
> + addi t2, t1, -1 /* mask off unaligned part of ptr */
> + and t2, t5, t2
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search to align ptr to 2*vlenb */
> + vsetvli t2, t2, e8, m2, ta, mu
> +
> + vle8.v v2, (t5) /* load data into v2(,v3) */
> + vmseq.vx v4, v2, zero /* check for null terminator */
> + vfirst.m t4, v4 /* grab its position, if any */
> + vmsbf.m v0, v4 /* select valid chars */
> + vmseq.vx v0, v2, a1, v0.t /* search for candidate byte */
> + vfirst.m t3, v0 /* grab its position, if any */
> + bltz t3, 2f /* did we find a candidate? */
> +
> +3: add a0, t3, t5 /* we did! grab the address */
> + vmsof.m v1, v0 /* there might be more than one */
> + vmandn.mm v0, v0, v1 /* so clear the one we just found */
> + vfirst.m t3, v0 /* is there another? */
> + bgez t3, 3b
> +
> +2: bgez t4, .Ldone /* did we see a null terminator? */
> + add t5, t5, t2
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> +
> +1: vle8.v v2, (t5)
> + vmseq.vx v4, v2, zero
> + vfirst.m t4, v4
> + vmsbf.m v0, v4
> + vmseq.vx v0, v2, a1, v0.t
> + vfirst.m t3, v0
> + bltz t3, 2f
> +
> +3: add a0, t3, t5
> + vmsof.m v1, v0
> + vmandn.mm v0, v0, v1
> + vfirst.m t3, v0
> + bgez t3, 3b
> +
> +2: add t5, t5, t1
> + bltz t4, 1b
> +
> +.Ldone:
> + ret
> +
> +.size strrchr, .-strrchr
> +libc_hidden_builtin_def (strrchr)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strspn.S b/sysdeps/riscv/rv64/rvv/strspn.S
> new file mode 100644
> index 0000000000..2b9af5cc2d
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strspn.S
> @@ -0,0 +1,189 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strspn
> +.type strspn,@function
> +
> +.globl strcspn
> +.type strcspn,@function
> +
> +/*
> + * optimized strspn / strcspn for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 32
> + * strategy:
> + * - build a 256-bit table on the stack, where each elt is zero
> + * if encountering it should terminate computation and nonzero otherwise
> + * - use vectorised lookups into this to check 2*vlen elts at a time;
> + * this code is identical for strspan and strcspan and can be shared
> + *
> + * note that while V mandates at least 128 bit wide regs,
> + * we are building a 256 bit lookup table
> + * therefore we use either LMUL=1 or 2 depending on what the target supports
> + * therefore we only use even vector register numbers,
> + * so everything still works if we go with LMUL=2
> + */
> +
I wonder if we could adapt the generic implementation so that riscv only reimplements
the vectorized search, instead of all the boilerplate to generate the table and
do the early tests.
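Something along these lines, as a plain-C sketch of the table/scan split being suggested (the vectorized code would then only replace the scan loop):

#include <stddef.h>

size_t
strspn (const char *s, const char *accept)
{
  /* Boilerplate: build a 256-entry membership table from the accept set.  */
  unsigned char table[256] = { 0 };
  for (const unsigned char *a = (const unsigned char *) accept; *a != '\0'; a++)
    table[*a] = 1;

  /* Scan: the part a vectorized implementation would replace.  */
  const unsigned char *p = (const unsigned char *) s;
  while (table[*p])
    p++;
  return (size_t) (p - (const unsigned char *) s);
}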
> +# -----------------------------
> +
> +.align 2
> +
> +strspn:
> + lbu t0, 0(a1)
> + bnez t0, .Lbuild_table
> + mv a0, zero
> + ret
> +
> +.Lbuild_table:
> + mv a6, a0 /* store incoming a0 */
> + li t1, 32 /* want to deal with 256 bits at a time, so 32 bytes */
> +
> + vsetvli zero, t1, e8, m1, tu, mu
> +#if __riscv_v_min_vlen < 256
> + /* we want to build a 256-bit table, so use vlenb*2,
> + * m2 if regs are 128 bits wide or vlenb, m1 if >= 256
> + * 'V' extension specifies a minimum vlen of 128 so this should cover
> + * all cases; we can skip the check if we know vlen >= 256 at compile time
> + */
> + csrr t2, vlenb
> + bgeu t2, t1, 1f
> + vsetvli zero, t1, e8, m2, tu, mu
> +1:
> +#endif // __riscv_v_min_vlen
> +
> + /* read one char from the charset at a time and write the correct bit
> + * in the lookup table; we could do SIMD iff we ever get an extension
> + * that provides some way of scattering bytes into a reg group
> + */
> + vmv.v.x v16, zero /* clear out table */
> + vmv.v.x v8, zero /* clear out v8 */
> + li t3, 1
> + vmv.s.x v8, t3 /* v8 now all zeroes except bottom byte */
> +
> +1: vmv.v.x v2, zero /* clear out v2 */
> + addi a1, a1, 1 /* advance charset ptr */
> + srli t2, t0, 3 /* divide the byte we read earlier by 8 */
> + vslideup.vx v2, v8, t2 /* v2 now 1 in the correct byte 0 elsewhere */
> + vsll.vx v2, v2, t0 /* v2 now 1 in the correct bit, 0 elsewhere */
> + vor.vv v16, v16, v2 /* or it in */
> + lbu t0, 0(a1) /* fetch next bute */
> + bnez t0, 1b /* if it's null, go round again */
> +
> +/*
> + * Table is now built in v16.
> + * Strategy:
> + * - fetch next t1 bytes from memory
> + * - vrgather on their values divided by 8 to get relevant bytes of table
> + * - shift right to get the correct bit into bit 1
> + * - and with 1, compare with expected terminator value, then check mask
> + * to see if we've found a terminator
> + *
> + * Before we can begin, a0 needs to be t1-aligned, so that when we fetch
> + * the next t1 bytes - any of which may be the null terminator -
> + * we do not cross a page boundary and read unmapped memory. Therefore
> + * we have one read of however many bytes are needed to align a0,
> + * before the main loop.
> + */
> +
> +.Lscan_table:
> + vmv.v.x v8, t3 /* v8 now t1 bytes of 0x01 */
> +
> + and t2, a0, t1 /* mask to align to t1 */
> + beqz t2, 2f /* or skip if we're already aligned */
> + sub t2, t1, t2 /* t2 now bytes to read to align to t1 */
> +
> + vid.v v2 /* build mask instead of changing vl */
> + vmsltu.vx v0, v2, t2 /* so we don't need to track LMUL */
> +
> + vle8.v v2, (a0), v0.t /* load next bytes from input */
> + vsrl.vi v4, v2, 3 /* divide by 8 */
> + vrgather.vv v6, v16, v4 /* corresponding bytes of bit table */
> + vsrl.vv v6, v6, v2 /* shift correct bits to lsb */
> + vand.vv v6, v6, v8 /* and with 1 to complete the lookups */
> + vmseq.vx v4, v6, zero, v0.t /* check to see if any 0s are present */
> + vfirst.m t0, v4 /* index of the first 0, if any */
> + bgez t0, .Lscan_end /* if we found one, stop */
> + add a0, a0, t2 /* advance by number of bytes we read */
> +
> +2: add a6, a6, t1 /* we'll advance a0 before the exit check */
> +1: vle8.v v2, (a0) /* as above but unmasked so t1 elts per pass */
> + add a0, a0, t1
> +
> + vsrl.vi v4, v2, 3
> + vrgather.vv v6, v16, v4
> + vsrl.vv v6, v6, v2
> + vand.vv v6, v6, v8
> +
> + vmseq.vx v4, v6, zero
> + vfirst.m t0, v4
> + bltz t0, 1b
> +
> +.Lscan_end:
> + add a0, a0, t0 /* calculate offset to terminating byte */
> + sub a0, a0, a6
> + ret
> +.size strspn, .-strspn
> +
> +/* strcspn
> + *
> + * table build exactly as for strspn, except:
> + * - the lookup table starts with all bits except bit 0 of byte 0 set
> + * - we clear the corresponding bit for each byte in the charset
> + * once table is built, we can reuse the scan code directly
> + */
> +
> +strcspn:
> + lbu t0, 0(a1)
> + beqz t0, strlen /* no rejections -> prefix is whole string */
> +
> + mv a6, a0
> + li t1, 32
> +
> + vsetvli zero, t1, e8, m1, tu, mu
> +#if __riscv_v_min_vlen < 256
> + csrr t2, vlenb
> + bgeu t2, t1, 1f
> + vsetvli zero, t1, e8, m2, tu, mu
> +1:
> +#endif // __riscv_v_min_vlen
> +
> + vmv.v.x v8, zero
> + li t3, 1 /* all bits clear except bit 0 of byte 0 */
> + vmv.s.x v8, t3
> + vnot.v v16, v8 /* v16 is the inverse of that */
> + li t4, -1
> +
> +1: vmv.v.x v2, zero
> + addi a1, a1, 1 /* advance charset ptr */
> + srli t2, t0, 3 /* select correct bit in v2 */
> + vslideup.vx v2, v8, t2
> + vsll.vx v2, v2, t0
> + vnot.v v2, v2 /* invert */
> + vand.vv v16, v16, v2 /* clear the relevant bit of table */
> + lbu t0, 0(a1)
> + bnez t0, 1b
> + j .Lscan_table
> +.size strcspn, .-strcspn
> +
> +libc_hidden_builtin_def (strspn)
> +libc_hidden_builtin_def (strcspn)
> \ No newline at end of file
On Wed, Feb 1, 2023 at 3:54 AM Sergei Lewis <slewis@rivosinc.com> wrote:
>
> Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
> strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
> targeting the riscv "V" extension, version 1.0
>
> The vectorised implementations assume VLENB of at least 128 and at least 32
> registers (as mandated by the "V" extension spec). They also assume that
> VLENB is a power of two which is no larger than the page size, and (as
> vectorised code in glibc for other platforms does) that it is safe to read
> past null terminators / buffer ends provided one does not cross a page
> boundary.
There should probably be a mention of performance gains vs the generic
implementations in the commit message to justify this.
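(As a throwaway illustration of the kind of measurement being asked for, not glibc's benchtests: time many copies through a function pointer, once per implementation under test.)

#include <stdio.h>
#include <string.h>
#include <time.h>

#define ITERS 1000000
#define SIZE  4096

static unsigned char src[SIZE], dst[SIZE];

static double
time_copy (void *(*copy) (void *, const void *, size_t))
{
  struct timespec t0, t1;
  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (long i = 0; i < ITERS; i++)
    copy (dst, src, SIZE);
  clock_gettime (CLOCK_MONOTONIC, &t1);
  return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
}

int
main (void)
{
  printf ("memcpy: %.3f s for %d copies of %d bytes\n",
          time_copy (memcpy), ITERS, SIZE);
  return 0;
}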
>
> Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
> ---
> sysdeps/riscv/rv64/rvv/Implies | 2 +
> sysdeps/riscv/rv64/rvv/memchr.S | 127 +++++++++++++++++++
> sysdeps/riscv/rv64/rvv/memcmp.S | 93 ++++++++++++++
> sysdeps/riscv/rv64/rvv/memcpy.S | 154 +++++++++++++++++++++++
> sysdeps/riscv/rv64/rvv/memmove.c | 22 ++++
> sysdeps/riscv/rv64/rvv/memset.S | 89 ++++++++++++++
> sysdeps/riscv/rv64/rvv/strchr.S | 92 ++++++++++++++
> sysdeps/riscv/rv64/rvv/strchrnul.c | 22 ++++
> sysdeps/riscv/rv64/rvv/strcmp.S | 108 +++++++++++++++++
> sysdeps/riscv/rv64/rvv/strcpy.S | 72 +++++++++++
> sysdeps/riscv/rv64/rvv/strcspn.c | 22 ++++
> sysdeps/riscv/rv64/rvv/strlen.S | 67 ++++++++++
> sysdeps/riscv/rv64/rvv/strncmp.S | 104 ++++++++++++++++
> sysdeps/riscv/rv64/rvv/strncpy.S | 96 +++++++++++++++
> sysdeps/riscv/rv64/rvv/strnlen.S | 81 +++++++++++++
> sysdeps/riscv/rv64/rvv/strrchr.S | 88 ++++++++++++++
> sysdeps/riscv/rv64/rvv/strspn.S | 189 +++++++++++++++++++++++++++++
> 17 files changed, 1428 insertions(+)
> create mode 100644 sysdeps/riscv/rv64/rvv/Implies
> create mode 100644 sysdeps/riscv/rv64/rvv/memchr.S
> create mode 100644 sysdeps/riscv/rv64/rvv/memcmp.S
> create mode 100644 sysdeps/riscv/rv64/rvv/memcpy.S
> create mode 100644 sysdeps/riscv/rv64/rvv/memmove.c
> create mode 100644 sysdeps/riscv/rv64/rvv/memset.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strchr.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strchrnul.c
> create mode 100644 sysdeps/riscv/rv64/rvv/strcmp.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strcpy.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strcspn.c
> create mode 100644 sysdeps/riscv/rv64/rvv/strlen.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strncmp.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strncpy.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strnlen.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strrchr.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strspn.S
>
> diff --git a/sysdeps/riscv/rv64/rvv/Implies b/sysdeps/riscv/rv64/rvv/Implies
> new file mode 100644
> index 0000000000..b07b4cb906
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/Implies
> @@ -0,0 +1,2 @@
> +riscv/rv64/rvd
> +
> diff --git a/sysdeps/riscv/rv64/rvv/memchr.S b/sysdeps/riscv/rv64/rvv/memchr.S
> new file mode 100644
> index 0000000000..a7e32b8f25
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memchr.S
> @@ -0,0 +1,127 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +
> +/* Optimised memchr for riscv with vector extension
> + * Assumptions:
> + * - cpu becomes bandwidth limited at or before
> + * 2 vector register sized read/write operations
> + * + 2 scalar operations
> + * + conditional branch
> + */
> +
> +.globl memchr
> +.type memchr,@function
> +
> +.align 2
> +memchr:
> + beqz a2, .Lnot_found
> + csrr t1, vlenb
> + bgeu a2, t1, .Lvector_path /* only use vector path if we're scanning
> + at least vlenb bytes */
> +
> +#ifndef __riscv_strict_align
> + li a3, 8
> + blt a2, a3, .Lbytewise
> +
> + li t1, 0x0101010101010101
> + slli a4, t1, 7 /* a4 = 0x8080808080808080 */
> + mul t2, a1, t1 /* entirety of t2 is now repeats of target character;
> + assume mul is at worst no worse than 3*(shift+OR),
> + otherwise do that instead */
> +
> +/*
> + * strategy:
> + * t4 = ((*a0) ^ t2)
> + * - now t4 contains zero bytes if and only if next word of memory
> + * had target character at those positions
> + *
> + * t4 = ((t4-0x0101010101010101) & ~t4) & 0x8080808080808080
> + * - all nonzero bytes of t4 become 0; zero bytes become 0x80
> + *
> + * if t4 is nonzero, find the index of the byte within it, add to a0 and return
> + * otherwise, loop
> + */
> +
> +1:
> + ld t4, (a0) /* t4 = load next 8 bytes */
> + xor t4, t4, t2
> + sub t5, t4, t1
> + not t4, t4
> + and t4, t5, t4
> + and t4, t4, a4
> + bnez t4, .Lbytewise /* could use ctzw, mod+lookup or just binary chop
> + to locate byte of interest in t4 but profiling
> + shows these approaches are at best no better */
> + addi a2, a2, -8
> + addi a0, a0, 8
> + bgeu a2, a3, 1b
> + beqz a2, .Lnot_found
> +#endif // __riscv_strict_align
> +
> +/* too little data for a dword. mask calculation and branch mispredict costs
> + make checking a word not worthwhile. degrade to bytewise search. */
> +
> +.Lbytewise:
> + add t2, a0, a2
> +
> +1:
> + lb t1, (a0)
> + beq t1, a1, .Lfound
> + addi a0, a0, 1
> + blt a0, t2, 1b
> +
> +.Lnot_found:
> + mv a0, zero
> +.Lfound:
> + ret
> +
> +.Lvector_path:
> + vsetvli t2, a2, e8, m2, ta, ma
> +
> +1:
> + vle8.v v2, (a0)
> + vmseq.vx v0, v2, a1
> + vfirst.m t3, v0
> + bgez t3, .Lvec_found
> + add a0, a0, t2
> + sub a2, a2, t2
> + bge a2, t2, 1b
> + bnez a2, 2f
> + mv a0, zero
> + ret
> +
> +2:
> + vsetvli t2, a2, e8, m2, ta, ma
> + vle8.v v2, (a0)
> + vmseq.vx v0, v2, a1
> + vfirst.m t3, v0
> + bgez t3, .Lvec_found
> + mv a0, zero
> + ret
> +
> +.Lvec_found:
> + add a0, a0, t3
> + ret
> +
> +.size memchr, .-memchr
> +libc_hidden_builtin_def (memchr)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/memcmp.S b/sysdeps/riscv/rv64/rvv/memcmp.S
> new file mode 100644
> index 0000000000..a945753a5f
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memcmp.S
> @@ -0,0 +1,93 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +/* Optimised memcmp for riscv with vector extension
> + */
> +
> +.globl memcmp
> +.type memcmp,@function
> +
> +.align 2
> +
> +memcmp:
> + mv t2, zero
> + beqz a2, .Ldone
> +
> + li t1, 5 /* scalar path cheaper for 1-4 elts */
> + bltu a2, t1, .Lscalar
> +
> + /* main loop, vlenb*2 elts at a time */
> + vsetvli t1, a2, e8, m2, ta, ma
> +
> +1:
> + vle8.v v2, (a0) /* load elts */
> + vle8.v v4, (a1)
> + vmsne.vv v0, v2, v4 /* compare */
> + vfirst.m t3, v0
> + bgez t3, .Lvec_diff /* found a difference ? */
> + add a0, a0, t1 /* not yet, advance everything */
> + add a1, a1, t1
> + sub a2, a2, t1
> + bgeu a2, t1, 1b
> +
> + bnez a2, .Ltail
> + mv a0, zero
> + ret
> +
> +.Ltail:
> + /* handle tail. we know a2 < vlenb*2 so just load and compare the lot */
> + vsetvli t1, a2, e8, m2, ta, ma
> + vle8.v v2, (a0)
> + vle8.v v4, (a1)
> + vmsne.vv v0, v2, v4
> + vfirst.m t3, v0
> + bgez t3, .Lvec_diff
> + mv a0, zero /* no diff found */
> + ret
> +
> +.Lvec_diff: /* v2, v4 differ at elt t3 */
> + add a0, a0, t3
> + add a1, a1, t3
> + lbu t0, (a0)
> + lbu t1, (a1)
> + sub a0, t0, t1
> + ret
> +
> +.Lscalar:
> + add t3, a0, a2
> +
> +1:
> + lbu t0, (a0)
> + lbu t1, (a1)
> + sub t2, t0, t1
> + bnez t2, .Ldone
> + addi a0, a0, 1
> + addi a1, a1, 1
> + bltu a0, t3, 1b
> +
> +.Ldone:
> + mv a0, t2
> + ret
> +
> +
> +.size memcmp, .-memcmp
> +libc_hidden_builtin_def (memcmp)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/memcpy.S b/sysdeps/riscv/rv64/rvv/memcpy.S
> new file mode 100644
> index 0000000000..7b37ec285d
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memcpy.S
> @@ -0,0 +1,154 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +/* Optimised memcpy and memmove for riscv with vector extension
> + */
> +
> +.globl memcpy
> +.type memcpy,@function
> +.globl memmove
> +.type memmove,@function
> +
> +.align 2
> +memmove:
> + bge a0, a1, .Lmemcpy_rev
> +
> +memcpy:
> +.Lmemcpy_fwd:
> + mv t0, a0 /* t0 = preserve a0 so we can return it */
> + csrr t2, vlenb /* t2 = number of bytes per vectorised copy op */
> + slli t5, t2, 1 /* t5 = number of bytes per loop */
> + addi t3, t5, -1 /* generate mask */
> + not t4, t3
> + and t4, a2, t4 /* t4 = bytes copied in vectorised pass */
> +
> + beqz t4, .Lscalar_fwd /* size too small for even one pass? */
> +
> + and a2, a2, t3 /* a2 = bytes still left to copy after pass */
> + add t4, t4, a1 /* t4 = src at end of vectorised pass */
> +
> +1:
> + vl2r.v v2, (a1) /* load, advance source */
> + add a1, a1, t5
> + vs2r.v v2, (t0) /* store, advance dest */
> + add t0, t0, t5
> + bltu a1, t4, 1b /* src at end? */
> +
> + bltu a2, t2, .Lscalar_fwd /* should we do one more vec load/store? */
> + vl1r.v v2, (a1)
> + sub a2, a2, t2
> + add a1, a1, t2
> + vs1r.v v2, (t0)
> + add t0, t0, t2
> +
> +.Lscalar_fwd:
> + bnez a2, .Lnobail
> +.Lbail:
> + ret
> +.Lnobail:
> +
> +#ifndef __riscv_strict_align
> + addi t2, zero, 4
> + bltu a2, t2, .Lsingle_bytes
> +1:
> + lw t3, 0(a1)
> + addi a1, a1, 4
> + sw t3, 0(t0)
> + addi t0, t0, 4
> + addi a2, a2, -4
> + bgeu a2, t2, 1b
> +#endif // __riscv_strict_align
> +
> +.Lsingle_bytes:
> + beqz a2, .Lbail
> + add a2, a2, a1 /* a2 = src + remaining size */
> +1:
> + lb t1, 0(a1)
> + sb t1, 0(t0)
> + addi a1, a1, 1
> + addi t0, t0, 1
> + bltu a1, a2, 1b
> + ret
> +.size memcpy, .-memcpy
> +
> +
> +.Lmemcpy_rev:
> + beq a0, a1, .Lmemcpy_rev_done
> + add t0, a0, a2 /* t0 = dest so we can return a0=dest later */
> + add t6, a1, a2 /* dest and src both point to byte */
> + /* immediately after end of buffer */
> +
> + csrr t2, vlenb /* t2 = number of bytes per pass */
> + slli t5, t2, 1 /* t5 = number of bytes per entire loop */
> + addi t3, t5, -1 /* t3 = (bytes per loop) mask */
> + not t4, t3 /* generate mask for bytes processed by loop */
> + and t4, a2, t4 /* t4 = bytes copied in vectorised pass */
> +
> + beqz t4, .Lscalar_rev /* size too small for even one pass? */
> +
> + and a2, a2, t3 /* a2 = bytes still left to copy after pass */
> + sub t4, t6, t4 /* t4 = src at end of vectorised pass */
> +
> +1:
> + sub t6, t6, t5
> + sub t0, t0, t5
> + vl2r.v v2, (t6) /* load, advance source */
> + vs2r.v v2, (t0) /* store, advance dest */
> + bgtu t6, t4, 1b /* src at end? */
> +
> + bltu a2, t2, .Lscalar_rev /* should we do one more vec load/store? */
> + sub t6, t6, t2
> + sub t0, t0, t2
> + sub a2, a2, t2
> + vl1r.v v2, (t6)
> + vs1r.v v2, (t0)
> +
> +.Lscalar_rev:
> +#ifndef __riscv_strict_align
> + beqz a2, .Lbail
> +
> + addi t2, zero, 4
> + bltu a2, t2, 2f
> +1:
> + addi t6, t6, -4
> + addi t0, t0, -4
> + addi a2, a2, -4
> + lw t3, 0(t6)
> + sw t3, 0(t0)
> + bgeu a2, t2, 1b
> +2:
> +#endif // __riscv_strict_align
> +
> + beqz a2, .Lbail
> +1:
> + addi t6, t6, -1
> + addi t0, t0, -1
> + lb t1, 0(t6)
> + sb t1, 0(t0)
> + bgtu t0, a0, 1b
> +
> +.Lmemcpy_rev_done:
> + ret
> +
> +.size memmove, .-memmove
> +libc_hidden_builtin_def (memcpy)
> +libc_hidden_builtin_def (memmove)
> \ No newline at end of file
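A scalar model of the direction handling above, for reference (illustrative only; the vector code moves vlenb*2 bytes per pass rather than single bytes):

#include <stdint.h>
#include <stddef.h>

void *
memmove_ref (void *dst, const void *src, size_t n)
{
  unsigned char *d = dst;
  const unsigned char *s = src;
  if ((uintptr_t) d < (uintptr_t) s)
    while (n--)
      *d++ = *s++;          /* forward pass, as in .Lmemcpy_fwd */
  else if ((uintptr_t) d > (uintptr_t) s)
    while (n--)
      d[n] = s[n];          /* backward pass, as in .Lmemcpy_rev */
  return dst;
}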
> diff --git a/sysdeps/riscv/rv64/rvv/memmove.c b/sysdeps/riscv/rv64/rvv/memmove.c
> new file mode 100644
> index 0000000000..47734854f9
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memmove.c
> @@ -0,0 +1,22 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +/* memmove is implemented in memcpy.S
> + */
> diff --git a/sysdeps/riscv/rv64/rvv/memset.S b/sysdeps/riscv/rv64/rvv/memset.S
> new file mode 100644
> index 0000000000..6f82c542b1
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memset.S
> @@ -0,0 +1,89 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +
> +/* Optimised memset for riscv with vector extension
> + */
> +
> +.globl memset
> +.type memset,@function
> +
> +.align 2
> +memset:
> + mv t0, a0 /* t0 = dest so we can return a0 later */
> + vsetvli t2, a2, e8, m2, ta, ma /* t2 = elts per copy */
> + beqz t2, .Lscalar
> +
> + vmv.v.x v2, a1 /* splat value across v2 */
> +
> + slli t3, t2, 1
> + bgtu t3, a2, .Lsinglestore
> +
> +1:
> + vse8.v v2, (t0)
> + add t0, t0, t2
> + vse8.v v2, (t0)
> + add t0, t0, t2
> + sub a2, a2, t3
> + bgeu a2, t3, 1b
> + bgeu a2, t2, .Lsinglestore
> + bnez a2, .Lscalar
> +
> +.Lbail:
> + ret
> +
> +.Lsinglestore:
> + bgtu t2, a2, .Lscalar
> + vse8.v v2, (t0)
> + add t0, t0, t2
> + sub a2, a2, t2
> +
> +.Lscalar:
> + beqz a2, .Lbail
> +
> +#ifndef __riscv_strict_align
> + slli t2, a1, 8
> + or a1, a1, t2
> + slli t2, a1, 16
> + or a1, a1, t2
> +
> + addi t2, zero, 4
> + bltu a2, t2, 2f
> +
> +1:
> + sw a1, 0(t0)
> + addi t0, t0, 4
> + addi a2, a2, -4
> + bgeu a2, t2, 1b
> +2:
> + beqz a2, .Lbail
> +#endif // __riscv_strict_align
> +
> + add a2, a2, t0
> +1:
> + sb a1, 0(t0)
> + addi t0, t0, 1
> + bltu t0, a2, 1b
> + ret
> +
> +.size memset, .-memset
> +libc_hidden_builtin_def (memset)
> \ No newline at end of file
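The shift/or pair in the non-strict-align tail replicates the fill byte across a 32-bit word before the sw stores; in scalar C this is roughly (sketch only):

#include <stdint.h>

static uint32_t
splat32 (uint8_t c)
{
  uint32_t v = c;
  v |= v << 8;     /* 000000cc -> 0000cccc */
  v |= v << 16;    /* 0000cccc -> cccccccc */
  return v;
}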
> diff --git a/sysdeps/riscv/rv64/rvv/strchr.S b/sysdeps/riscv/rv64/rvv/strchr.S
> new file mode 100644
> index 0000000000..0b37174c55
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strchr.S
> @@ -0,0 +1,92 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strchr
> +.type strchr,@function
> +
> +.globl __strchrnul
> +.type __strchrnul,@function
> +
> +/*
> + * optimized strchr for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 2*vlenb
> + */
> +
> +.align 2
> +__strchrnul:
> + li t5, -1
> + j 1f
> +
> +strchr:
> + mv t5, zero
> +1: csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> + addi t2, t1, -1 /* mask off unaligned part of pointer */
> + and t2, a0, t2
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search however many bytes
> + are needed to align the pointer */
> + vsetvli t2, t2, e8, m2, ta, mu
> +
> + vle8.v v2, (a0) /* load data into v2(,v3) */
> + vmseq.vx v4, v2, zero
> + vfirst.m t4, v4
> + vmsbf.m v0, v4
> + vmseq.vx v0, v2, a1, v0.t
> + vfirst.m t3, v0
> + bgez t3, .Lfound
> + bgez t4, .Lbufferend
> + add a0, a0, t2
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, mu
> + li t4, -1
> +
> +1:
> + vle8.v v2, (a0)
> + vmseq.vx v4, v2, zero
> + vfirst.m t4, v4
> + vmsbf.m v0, v4
> + vmseq.vx v0, v2, a1, v0.t
> + vfirst.m t3, v0
> + bgez t3, .Lfound
> + bgez t4, .Lbufferend
> + add a0, a0, t1
> + j 1b
> +
> +.Lfound: /* found the target at a0+t3 */
> + add a0, a0, t3
> + ret
> +
> +.Lbufferend:
> + add a0, a0, t4
> + and a0, a0, t5
> + ret
> +
> +.size strchr, .-strchr
> +.size __strchrnul, .-__strchrnul
> +
> +libc_hidden_builtin_def (strchr)
> +weak_alias (__strchrnul, strchrnul)
> \ No newline at end of file
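A scalar model of how the two entry points share one body: the only difference is the t5 mask applied at .Lbufferend, which turns "pointer to the terminator" into NULL for strchr. Sketch only; the _ref names are made up.

#include <stddef.h>

char *
strchrnul_ref (const char *s, int c)
{
  while (*s != '\0' && *s != (char) c)
    s++;
  return (char *) s;
}

char *
strchr_ref (const char *s, int c)
{
  char *p = strchrnul_ref (s, c);
  return *p == (char) c ? p : NULL;   /* plays the role of the t5 masking at .Lbufferend */
}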
> diff --git a/sysdeps/riscv/rv64/rvv/strchrnul.c b/sysdeps/riscv/rv64/rvv/strchrnul.c
> new file mode 100644
> index 0000000000..259da80358
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strchrnul.c
> @@ -0,0 +1,22 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +/* strchrnul is implemented in strchr.S
> + */
> diff --git a/sysdeps/riscv/rv64/rvv/strcmp.S b/sysdeps/riscv/rv64/rvv/strcmp.S
> new file mode 100644
> index 0000000000..4a219221ac
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcmp.S
> @@ -0,0 +1,108 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strcmp
> +.type strcmp,@function
> +
> +.align 2
> +
> +/* most of the time, one or both sides are unaligned and their alignments differ
> + * we need to check for a null terminator before crossing a page boundary
> + * strategy:
> + * - for each side, calculate masks for alignment and (vlenb * 2) - alignment
> + * - while no difference encountered:
> + * - for each side:
> + * - load bytes to end of next vlenb*2 block
> + * - check for null terminator
> + * - if no terminator, load bytes to fill rest of register
> + * - compare sides
> + */
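The per-side "safe number of bytes to read" computed in the prologue (t6 and t5) is just the distance to the next block boundary; since the block size is a power of two no larger than the page size, reading that many bytes cannot cross a page. In scalar C (sketch only):

#include <stdint.h>
#include <stddef.h>

static size_t
safe_len (const char *p, size_t blk)   /* blk = 2 * vlenb */
{
  return blk - ((uintptr_t) p & (blk - 1));
}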
> +
> +strcmp:
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> + vsetvli zero, t1, e8, m2, ta, mu
> + vid.v v30
> + addi t2, t1, -1 /* mask for unaligned part of ptr */
> + and t6, a0, t2 /* unaligned part of lhs */
> + and t5, a1, t2 /* unaligned part of rhs */
> + sub t6, t1, t6 /* safe number of lhs bytes to read */
> + sub t5, t1, t5 /* same, rhs */
> + vmsltu.vx v28, v30, t6 /* v28 = mask for first half of lhs load */
> + vmsltu.vx v26, v30, t5 /* v26 = mask for first half of rhs load */
> + vmv.v.x v16, zero
> + vmv.v.x v18, zero
> +
> +1: vmv.v.v v0, v28 /* lhs mask */
> + vle8.v v2, (a0), v0.t /* masked load from lhs */
> + vmseq.vx v16, v2, zero, v0.t /* check loaded bytes for null */
> + vmv.v.v v0, v26 /* rhs mask */
> + vfirst.m t2, v16 /* get lhs check result */
> + bgez t2, .Ltail /* bail if we can't safely check rest */
> + vle8.v v4, (a1), v0.t /* masked load from rhs */
> + vmseq.vx v18, v4, zero, v0.t /* check partial rhs for null */
> + vmnot.m v0, v28 /* mask for rest of lhs */
> + vfirst.m t3, v18 /* get check result */
> + bltz t3, 2f /* test it */
> + /* we see null terminator */
> + bge t3, t6, .Ltail /* have enough bytes for vector cmp? */
> +
> + vmsleu.vx v0, v30, t3 /* select rest + null */
> + vmsne.vv v0, v2, v4, v0.t /* compare */
> + vfirst.m t3, v0
> + bgez t3, 3f
> + mv a0, zero /* no difference */
> + ret
> +3: add a0, a0, t3
> + add a1, a1, t3
> + lbu t0, (a0)
> + lbu t1, (a1)
> +.Ldiff:
> + sub a0, t0, t1
> + ret
> +
> + /* ...no null terminator */
> +2: vle8.v v2, (a0), v0.t /* load rest of lhs */
> + vmnot.m v0, v26 /* mask for rest of rhs */
> + vle8.v v4, (a1), v0.t /* load rest of rhs */
> + vmsne.vv v0, v2, v4 /* compare */
> + add a0, a0, t1 /* advance ptrs */
> + vfirst.m t3, v0
> + add a1, a1, t1
> + bltz t3, 1b
> +
> + sub t3, t3, t1 /* found difference but we've already advanced a0 and a1 */
> + j 3b
> +
> +.Ltail:
> + lbu t0, (a0)
> + lbu t1, (a1)
> + bne t0, t1, .Ldiff
> + addi a0, a0, 1
> + addi a1, a1, 1
> + bnez t0, .Ltail
> + mv a0, zero
> + ret
> +
> +
> +.size strcmp, .-strcmp
> +libc_hidden_builtin_def (strcmp)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strcpy.S b/sysdeps/riscv/rv64/rvv/strcpy.S
> new file mode 100644
> index 0000000000..b21909d66f
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcpy.S
> @@ -0,0 +1,72 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strcpy
> +.type strcpy,@function
> +
> +/*
> + * optimized strcpy for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 2*vlenb
> + */
> +
> +.align 2
> +strcpy:
> + mv t0, a0 /* copy dest so we can return it */
> +
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> +
> + addi t2, t1, -1 /* mask unaligned part of ptr */
> + and t2, a1, t2
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search enough to align ptr */
> + vsetvli t2, t2, e8, m2, tu, mu
> + vle8.v v2, (a1)
> + vmseq.vx v4, v2, zero
> + vmsif.m v0, v4 /* copy but not past null */
> + vfirst.m t3, v4
> + vse8.v v2, (t0), v0.t
> + bgez t3, .Ldone
> + add t0, t0, t2
> + add a1, a1, t2
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> +
> +1:
> + vle8.v v2, (a1)
> + add a1, a1, t1
> + vmseq.vx v4, v2, zero
> + vmsif.m v0, v4
> + vfirst.m t3, v4
> + vse8.v v2, (t0), v0.t
> + add t0, t0, t1
> + bltz t3, 1b
> +
> +.Ldone:
> + ret
> +
> +.size strcpy, .-strcpy
> +libc_hidden_builtin_def (strcpy)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strcspn.c b/sysdeps/riscv/rv64/rvv/strcspn.c
> new file mode 100644
> index 0000000000..f0595a72fb
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcspn.c
> @@ -0,0 +1,22 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +/* strcspn is implemented in strspn.S
> + */
> diff --git a/sysdeps/riscv/rv64/rvv/strlen.S b/sysdeps/riscv/rv64/rvv/strlen.S
> new file mode 100644
> index 0000000000..c77d500693
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strlen.S
> @@ -0,0 +1,67 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strlen
> +.type strlen,@function
> +
> +/*
> + * optimized strlen for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 2*vlenb
> + */
> +
> +.align 2
> +strlen:
> + mv t4, a0 /* copy of buffer start */
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> + addi t2, t1, -1 /* mask off unaligned part of ptr */
> + and t2, a0, t2
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search fwd to align ptr */
> + vsetvli t2, t2, e8, m2, ta, ma
> + vle8.v v2, (a0)
> + vmseq.vx v0, v2, zero
> + vfirst.m t3, v0
> + bgez t3, .Lfound
> + add a0, a0, t2
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, ma /* search 2*vlenb bytes per pass */
> + add t4, t4, t1
> +
> +1:
> + vle8.v v2, (a0)
> + add a0, a0, t1
> + vmseq.vx v0, v2, zero
> + vfirst.m t3, v0
> + bltz t3, 1b
> +
> +.Lfound: /* found the 0; subtract */
> + sub a0, a0, t4 /* buffer start from current ptr */
> + add a0, a0, t3 /* and add offset into fetched */
> + ret /* data to get length */
> +
> +.size strlen, .-strlen
> +libc_hidden_builtin_def (strlen)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strncmp.S b/sysdeps/riscv/rv64/rvv/strncmp.S
> new file mode 100644
> index 0000000000..863e5cb525
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strncmp.S
> @@ -0,0 +1,104 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strncmp
> +.type strncmp,@function
> +
> +.align 2
> +
> +/* as strcmp, but with added checks on a2 (max count)
> + */
> +
> +strncmp:
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> + blt a2, t1, .Ltail /* degrade if max < vlenb*2 */
> + vsetvli zero, t1, e8, m2, ta, mu
> + vid.v v30
> + addi t2, t1, -1 /* mask unaligned part of ptr */
> + and t6, a0, t2 /* unaligned part of lhs */
> + and t5, a1, t2 /* unaligned part of rhs */
> + sub t6, t1, t6 /* safe count to read from lhs */
> + sub t5, t1, t5 /* same, rhs */
> + vmsltu.vx v28, v30, t6 /* mask for first part of lhs */
> + vmsltu.vx v26, v30, t5 /* mask for first part of rhs */
> + vmv.v.x v16, zero
> + vmv.v.x v18, zero
> +
> +
> +1: blt a2, t1, .Ltail
> + vmv.v.v v0, v28 /* lhs mask */
> + vle8.v v2, (a0), v0.t /* masked load from lhs */
> + vmseq.vx v16, v2, zero, v0.t /* check loaded bytes for null */
> + vmv.v.v v0, v26 /* rhs mask */
> + vfirst.m t2, v16 /* get lhs check result */
> + bgez t2, .Ltail /* can we safely check rest */
> + vle8.v v4, (a1), v0.t /* masked load from rhs */
> + vmseq.vx v18, v4, zero, v0.t /* check partial rhs */
> + vmnot.m v0, v28 /* mask for rest of lhs */
> + vfirst.m t3, v18 /* get check result */
> + bltz t3, 2f /* test it */
> + bge t3, t6, .Ltail
> +
> + vmsleu.vx v0, v30, t3 /* select rest of string + null */
> + vmsne.vv v0, v2, v4, v0.t /* compare */
> + vfirst.m t3, v0
> + bgez t3, 3f
> + mv a0, zero
> + ret
> +3: add a0, a0, t3
> + add a1, a1, t3
> + lbu t0, (a0)
> + lbu t1, (a1)
> +.Ldiff:
> + sub a0, t0, t1
> + ret
> +
> + /* ...no null terminator in first part of lhs or rhs */
> +2: vle8.v v2, (a0), v0.t /* load rest of lhs */
> + vmnot.m v0, v26 /* mask for rest of rhs */
> + vle8.v v4, (a1), v0.t /* load rest of rhs */
> + vmsne.vv v0, v2, v4 /* compare */
> + add a0, a0, t1 /* advance ptrs */
> + vfirst.m t3, v0
> + add a1, a1, t1
> + sub a2, a2, t1
> + bltz t3, 1b
> +
> + sub t3, t3, t1 /* found a diff but we've already advanced a0 and a1 */
> + j 3b
> +
> +.Ltail:
> + beqz a2, 1f
> + addi a2, a2, -1
> + lbu t0, (a0)
> + lbu t1, (a1)
> + bne t0, t1, .Ldiff
> + addi a0, a0, 1
> + addi a1, a1, 1
> + bnez t0, .Ltail
> +1: mv a0, zero
> + ret
> +
> +
> +.size strncmp, .-strncmp
> +libc_hidden_builtin_def (strncmp)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strncpy.S b/sysdeps/riscv/rv64/rvv/strncpy.S
> new file mode 100644
> index 0000000000..8b3a1e545c
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strncpy.S
> @@ -0,0 +1,96 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strncpy
> +.type strncpy,@function
> +
> +/*
> + * optimized strncpy for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 2*vlenb
> + */
> +
> +.align 2
> +strncpy:
> + mv t0, a0 /* need to return dest so copy */
> +
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> +
> + addi t2, t1, -1 /* mask off unaligned part of ptr */
> + and t2, a1, t2
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search to align the pointer */
> + vsetvli zero, t2, e8, m2, tu, mu
> + vle8.v v2, (a1)
> + vmseq.vx v4, v2, zero
> + vmsif.m v0, v4 /* copy to dest */
> + vfirst.m t3, v4
> + bgeu t2, a2, .Ldest_full
> + vse8.v v2, (t0), v0.t
> + bgez t3, .Lterminator_found
> + add t0, t0, t2
> + add a1, a1, t2
> + sub a2, a2, t2
> + beqz a2, .Ldone
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> +
> +1:
> + vle8.v v2, (a1)
> + add a1, a1, t1
> + vmseq.vx v4, v2, zero
> + vmsif.m v0, v4
> + vfirst.m t3, v4
> + bgeu t1, a2, .Ldest_full
> + vse8.v v2, (t0), v0.t
> + add t0, t0, t1
> + sub a2, a2, t1
> + bltz t3, 1b
> + sub t0, t0, t1
> +
> +.Lterminator_found:
> + addi sp, sp, -16
> + sd ra, 0(sp)
> + sd a0, 8(sp)
> + add a0, t0, t3
> + mv a1, zero
> + sub a2, a2, t3
> + jal ra, memset
> + ld ra, 0(sp)
> + ld a0, 8(sp)
> + addi sp, sp, 16
> +.Ldone:
> + ret
> +
> +.Ldest_full:
> + vid.v v6
> + vmsltu.vx v4, v6, a2
> + vmand.mm v0, v0, v4
> + vse8.v v2, (t0), v0.t
> + ret
> +
> +.size strncpy, .-strncpy
> +libc_hidden_builtin_def (strncpy)
> \ No newline at end of file
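For reference, the semantics being implemented, in scalar C (sketch only; the zero-padding loop is what .Lterminator_found hands off to memset):

#include <stddef.h>

char *
strncpy_ref (char *dest, const char *src, size_t n)
{
  size_t i = 0;
  for (; i < n && src[i] != '\0'; i++)
    dest[i] = src[i];
  for (; i < n; i++)
    dest[i] = '\0';
  return dest;
}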
> diff --git a/sysdeps/riscv/rv64/rvv/strnlen.S b/sysdeps/riscv/rv64/rvv/strnlen.S
> new file mode 100644
> index 0000000000..6d7ee65c7a
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strnlen.S
> @@ -0,0 +1,81 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl __strnlen
> +.type __strnlen,@function
> +
> +/* vector optimized strnlen
> + * assume it's safe to read to the end of the page
> + * containing either a null terminator or the last byte of the count or both,
> + * but not past it
> + * assume page size >= vlenb*2
> + */
> +
> +.align 2
> +__strnlen:
> + mv t4, a0 /* stash a copy of start for later */
> + beqz a1, .LzeroCount
> +
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> + addi t2, t1, -1 /* mask off unaligned part of ptr */
> + and t2, t2, a0
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search to align pointer to t1 */
> + bgeu t2, a1, 2f /* check it's safe */
> + mv t2, a1 /* it's not! look as far as permitted */
> +2: vsetvli t2, t2, e8, m2, ta, ma
> + vle8.v v2, (a0)
> + vmseq.vx v0, v2, zero
> + vfirst.m t3, v0
> + bgez t3, .Lfound
> + add a0, a0, t2
> + sub a1, a1, t2
> + bltu a1, t1, .LreachedCount
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, ma /* do 2*vlenb bytes per pass */
> +
> +1: vle8.v v2, (a0)
> + sub a1, a1, t1
> + vmseq.vx v0, v2, zero
> + vfirst.m t3, v0
> + bgez t3, .Lfound
> + add a0, a0, t1
> + bgeu a1, t1, 1b
> +.LreachedCount:
> + mv t2, a1 /* in case 0 < a1 < t1 */
> + bnez a1, 2b /* if so, still t2 bytes to check, all safe */
> +.LzeroCount:
> + sub a0, a0, t4
> + ret
> +
> +.Lfound: /* found the 0; subtract buffer start from current pointer */
> + add a0, a0, t3 /* and add offset into fetched data */
> + sub a0, a0, t4
> + ret
> +
> +.size __strnlen, .-__strnlen
> +weak_alias (__strnlen, strnlen)
> +libc_hidden_builtin_def (__strnlen)
> +libc_hidden_builtin_def (strnlen)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strrchr.S b/sysdeps/riscv/rv64/rvv/strrchr.S
> new file mode 100644
> index 0000000000..4bef8a3b9c
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strrchr.S
> @@ -0,0 +1,88 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strrchr
> +.type strrchr,@function
> +
> +/*
> + * optimized strrchr for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 2*vlenb
> + */
> +
> +.align 2
> +strrchr:
> + mv t5, a0 /* stash buffer ptr somewhere safe */
> + mv a0, zero /* result is nullptr unless we find better below */
> +
> + csrr t1, vlenb /* determine vlenb*2 */
> + add t1, t1, t1
> + addi t2, t1, -1 /* mask off unaligned part of ptr */
> + and t2, t5, t2
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search to align ptr to 2*vlenb */
> + vsetvli t2, t2, e8, m2, ta, mu
> +
> + vle8.v v2, (t5) /* load data into v2(,v3) */
> + vmseq.vx v4, v2, zero /* check for null terminator */
> + vfirst.m t4, v4 /* grab its position, if any */
> + vmsbf.m v0, v4 /* select valid chars */
> + vmseq.vx v0, v2, a1, v0.t /* search for candidate byte */
> + vfirst.m t3, v0 /* grab its position, if any */
> + bltz t3, 2f /* did we find a candidate? */
> +
> +3: add a0, t3, t5 /* we did! grab the address */
> + vmsof.m v1, v0 /* there might be more than one */
> + vmandn.mm v0, v0, v1 /* so clear the one we just found */
> + vfirst.m t3, v0 /* is there another? */
> + bgez t3, 3b
> +
> +2: bgez t4, .Ldone /* did we see a null terminator? */
> + add t5, t5, t2
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> +
> +1: vle8.v v2, (t5)
> + vmseq.vx v4, v2, zero
> + vfirst.m t4, v4
> + vmsbf.m v0, v4
> + vmseq.vx v0, v2, a1, v0.t
> + vfirst.m t3, v0
> + bltz t3, 2f
> +
> +3: add a0, t3, t5
> + vmsof.m v1, v0
> + vmandn.mm v0, v0, v1
> + vfirst.m t3, v0
> + bgez t3, 3b
> +
> +2: add t5, t5, t1
> + bltz t4, 1b
> +
> +.Ldone:
> + ret
> +
> +.size strrchr, .-strrchr
> +libc_hidden_builtin_def (strrchr)
> \ No newline at end of file
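A scalar model of the strategy: scan forward, remember the last match seen so far, and stop once the terminator shows up (sketch only, made-up name):

#include <stddef.h>

char *
strrchr_ref (const char *s, int c)
{
  const char *last = NULL;
  for (;; s++)
    {
      if (*s == (char) c)
        last = s;                /* the inner 3: loop keeps the latest hit in each block */
      if (*s == '\0')
        return (char *) last;
    }
}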
> diff --git a/sysdeps/riscv/rv64/rvv/strspn.S b/sysdeps/riscv/rv64/rvv/strspn.S
> new file mode 100644
> index 0000000000..2b9af5cc2d
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strspn.S
> @@ -0,0 +1,189 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strspn
> +.type strspn,@function
> +
> +.globl strcspn
> +.type strcspn,@function
> +
> +/*
> + * optimized strspn / strcspn for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 32
> + * strategy:
> + * - build a 256-bit table on the stack, where each elt is zero
> + * if encountering it should terminate computation and nonzero otherwise
> + * - use vectorised lookups into this to check 2*vlen elts at a time;
> + * this code is identical for strspn and strcspn and can be shared
> + *
> + * note that while V mandates at least 128 bit wide regs,
> + * we are building a 256 bit lookup table
> + * therefore we use either LMUL=1 or 2 depending on what the target supports
> + * therefore we only use even vector register numbers,
> + * so everything still works if we go with LMUL=2
> + */
> +
> +# -----------------------------
> +
> +.align 2
> +
> +strspn:
> + lbu t0, 0(a1)
> + bnez t0, .Lbuild_table
> + mv a0, zero
> + ret
> +
> +.Lbuild_table:
> + mv a6, a0 /* store incoming a0 */
> + li t1, 32 /* want to deal with 256 bits at a time, so 32 bytes */
> +
> + vsetvli zero, t1, e8, m1, tu, mu
> +#if __riscv_v_min_vlen < 256
> + /* we want to build a 256-bit table, so use vlenb*2,
> + * m2 if regs are 128 bits wide or vlenb, m1 if >= 256
> + * 'V' extension specifies a minimum vlen of 128 so this should cover
> + * all cases; we can skip the check if we know vlen >= 256 at compile time
> + */
> + csrr t2, vlenb
> + bgeu t2, t1, 1f
> + vsetvli zero, t1, e8, m2, tu, mu
> +1:
> +#endif // __riscv_v_min_vlen
> +
> + /* read one char from the charset at a time and write the correct bit
> + * in the lookup table; we could do SIMD iff we ever get an extension
> + * that provides some way of scattering bytes into a reg group
> + */
> + vmv.v.x v16, zero /* clear out table */
> + vmv.v.x v8, zero /* clear out v8 */
> + li t3, 1
> + vmv.s.x v8, t3 /* v8 now all zeroes except bottom byte */
> +
> +1: vmv.v.x v2, zero /* clear out v2 */
> + addi a1, a1, 1 /* advance charset ptr */
> + srli t2, t0, 3 /* divide the byte we read earlier by 8 */
> + vslideup.vx v2, v8, t2 /* v2 now 1 in the correct byte 0 elsewhere */
> + vsll.vx v2, v2, t0 /* v2 now 1 in the correct bit, 0 elsewhere */
> + vor.vv v16, v16, v2 /* or it in */
> + lbu t0, 0(a1) /* fetch next byte */
> + bnez t0, 1b /* loop until we hit the null terminator */
> +
> +/*
> + * Table is now built in v16.
> + * Strategy:
> + * - fetch next t1 bytes from memory
> + * - vrgather on their values divided by 8 to get relevant bytes of table
> + * - shift right to get the correct bit into bit 1
> + * - and with 1, compare with expected terminator value, then check mask
> + * to see if we've found a terminator
> + *
> + * Before we can begin, a0 needs to be t1-aligned, so that when we fetch
> + * the next t1 bytes - any of which may be the null terminator -
> + * we do not cross a page boundary and read unmapped memory. Therefore
> + * we have one read of however many bytes are needed to align a0,
> + * before the main loop.
> + */
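A scalar model of the bitmap approach described above (sketch only; strcspn builds the complement of the same table):

#include <stdint.h>
#include <stddef.h>

size_t
strspn_ref (const char *s, const char *accept)
{
  uint8_t table[32] = { 0 };                    /* 256 bits, one per byte value */
  for (const unsigned char *p = (const unsigned char *) accept; *p != '\0'; p++)
    table[*p >> 3] |= (uint8_t) (1u << (*p & 7));

  size_t i = 0;
  while (table[(unsigned char) s[i] >> 3] & (1u << ((unsigned char) s[i] & 7)))
    i++;                                        /* byte 0 is never set, so this stops at the terminator */
  return i;
}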
> +
> +.Lscan_table:
> + vmv.v.x v8, t3 /* v8 now t1 bytes of 0x01 */
> +
> + and t2, a0, t1 /* mask to align to t1 */
> + beqz t2, 2f /* or skip if we're already aligned */
> + sub t2, t1, t2 /* t2 now bytes to read to align to t1 */
> +
> + vid.v v2 /* build mask instead of changing vl */
> + vmsltu.vx v0, v2, t2 /* so we don't need to track LMUL */
> +
> + vle8.v v2, (a0), v0.t /* load next bytes from input */
> + vsrl.vi v4, v2, 3 /* divide by 8 */
> + vrgather.vv v6, v16, v4 /* corresponding bytes of bit table */
> + vsrl.vv v6, v6, v2 /* shift correct bits to lsb */
> + vand.vv v6, v6, v8 /* and with 1 to complete the lookups */
> + vmseq.vx v4, v6, zero, v0.t /* check to see if any 0s are present */
> + vfirst.m t0, v4 /* index of the first 0, if any */
> + bgez t0, .Lscan_end /* if we found one, stop */
> + add a0, a0, t2 /* advance by number of bytes we read */
> +
> +2: add a6, a6, t1 /* we'll advance a0 before the exit check */
> +1: vle8.v v2, (a0) /* as above but unmasked so t1 elts per pass */
> + add a0, a0, t1
> +
> + vsrl.vi v4, v2, 3
> + vrgather.vv v6, v16, v4
> + vsrl.vv v6, v6, v2
> + vand.vv v6, v6, v8
> +
> + vmseq.vx v4, v6, zero
> + vfirst.m t0, v4
> + bltz t0, 1b
> +
> +.Lscan_end:
> + add a0, a0, t0 /* calculate offset to terminating byte */
> + sub a0, a0, a6
> + ret
> +.size strspn, .-strspn
> +
> +/* strcspn
> + *
> + * table build exactly as for strspn, except:
> + * - the lookup table starts with all bits except bit 0 of byte 0 set
> + * - we clear the corresponding bit for each byte in the charset
> + * once table is built, we can reuse the scan code directly
> + */
> +
> +strcspn:
> + lbu t0, 0(a1)
> + beqz t0, strlen /* no rejections -> prefix is whole string */
> +
> + mv a6, a0
> + li t1, 32
> +
> + vsetvli zero, t1, e8, m1, tu, mu
> +#if __riscv_v_min_vlen < 256
> + csrr t2, vlenb
> + bgeu t2, t1, 1f
> + vsetvli zero, t1, e8, m2, tu, mu
> +1:
> +#endif // __riscv_v_min_vlen
> +
> + vmv.v.x v8, zero
> + li t3, 1 /* all bits clear except bit 0 of byte 0 */
> + vmv.s.x v8, t3
> + vnot.v v16, v8 /* v16 is the inverse of that */
> + li t4, -1
> +
> +1: vmv.v.x v2, zero
> + addi a1, a1, 1 /* advance charset ptr */
> + srli t2, t0, 3 /* select correct bit in v2 */
> + vslideup.vx v2, v8, t2
> + vsll.vx v2, v2, t0
> + vnot.v v2, v2 /* invert */
> + vand.vv v16, v16, v2 /* clear the relevant bit of table */
> + lbu t0, 0(a1)
> + bnez t0, 1b
> + j .Lscan_table
> +.size strcspn, .-strcspn
> +
> +libc_hidden_builtin_def (strspn)
> +libc_hidden_builtin_def (strcspn)
> \ No newline at end of file
> --
> 2.34.1
>
On Wed, Feb 1, 2023 at 11:38 AM Adhemerval Zanella Netto via
Libc-alpha <libc-alpha@sourceware.org> wrote:
>
>
>
> On 01/02/23 06:52, Sergei Lewis wrote:
> > Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
> > strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
> > targeting the riscv "V" extension, version 1.0
> >
> > The vectorised implementations assume VLENB of at least 128 and at least 32
> > registers (as mandated by the "V" extension spec). They also assume that
> > VLENB is a power of two which is no larger than the page size, and (as
> > vectorised code in glibc for other platforms does) that it is safe to read
> > past null terminators / buffer ends provided one does not cross a page
> > boundary.
> >
> > Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
>
> Some comments that might be useful, since I am working on the generic
> implementations below.
>
> Also, I think it should be split into one implementation per patch, unless the
> implementations are tied together (as for strchr/strchrnul, for instance). Do
> the vectorized routines only work for rv64?
>
> > ---
> > sysdeps/riscv/rv64/rvv/Implies | 2 +
> > sysdeps/riscv/rv64/rvv/memchr.S | 127 +++++++++++++++++++
> > sysdeps/riscv/rv64/rvv/memcmp.S | 93 ++++++++++++++
> > sysdeps/riscv/rv64/rvv/memcpy.S | 154 +++++++++++++++++++++++
> > sysdeps/riscv/rv64/rvv/memmove.c | 22 ++++
> > sysdeps/riscv/rv64/rvv/memset.S | 89 ++++++++++++++
> > sysdeps/riscv/rv64/rvv/strchr.S | 92 ++++++++++++++
> > sysdeps/riscv/rv64/rvv/strchrnul.c | 22 ++++
> > sysdeps/riscv/rv64/rvv/strcmp.S | 108 +++++++++++++++++
> > sysdeps/riscv/rv64/rvv/strcpy.S | 72 +++++++++++
> > sysdeps/riscv/rv64/rvv/strcspn.c | 22 ++++
> > sysdeps/riscv/rv64/rvv/strlen.S | 67 ++++++++++
> > sysdeps/riscv/rv64/rvv/strncmp.S | 104 ++++++++++++++++
> > sysdeps/riscv/rv64/rvv/strncpy.S | 96 +++++++++++++++
> > sysdeps/riscv/rv64/rvv/strnlen.S | 81 +++++++++++++
> > sysdeps/riscv/rv64/rvv/strrchr.S | 88 ++++++++++++++
> > sysdeps/riscv/rv64/rvv/strspn.S | 189 +++++++++++++++++++++++++++++
> > 17 files changed, 1428 insertions(+)
> > create mode 100644 sysdeps/riscv/rv64/rvv/Implies
> > create mode 100644 sysdeps/riscv/rv64/rvv/memchr.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/memcmp.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/memcpy.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/memmove.c
> > create mode 100644 sysdeps/riscv/rv64/rvv/memset.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strchr.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strchrnul.c
> > create mode 100644 sysdeps/riscv/rv64/rvv/strcmp.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strcpy.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strcspn.c
> > create mode 100644 sysdeps/riscv/rv64/rvv/strlen.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strncmp.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strncpy.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strnlen.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strrchr.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strspn.S
> >
> > diff --git a/sysdeps/riscv/rv64/rvv/Implies b/sysdeps/riscv/rv64/rvv/Implies
> > new file mode 100644
> > index 0000000000..b07b4cb906
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/Implies
> > @@ -0,0 +1,2 @@
> > +riscv/rv64/rvd
> > +
> > diff --git a/sysdeps/riscv/rv64/rvv/memchr.S b/sysdeps/riscv/rv64/rvv/memchr.S
> > new file mode 100644
> > index 0000000000..a7e32b8f25
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/memchr.S
> > @@ -0,0 +1,127 @@
> > +
>
> Spurious newline at the start. We also require a brief comment describing
> the file contents in new files.
>
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
>
> Not sure the 2012 start of the copyright range fits here.
>
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +
> > +/* Optimised memchr for riscv with vector extension
> > + * Assumptions:
> > + * - cpu becomes bandwidth limited at or before
> > + * 2 vector register sized read/write operations
> > + * + 2 scalar operations
> > + * + conditional branch
> > + */
> > +
> > +.globl memchr
> > +.type memchr,@function
> > +
> > +.align 2
> > +memchr:
>
> We have the ENTRY macro for that.
>
> > + beqz a2, .Lnot_found
>
> Maybe use the L macro here for local labels;
>
> > + csrr t1, vlenb
> > + bgeu a2, t1, .Lvector_path /* only use vector path if we're scanning
> > + at least vlenb bytes */
> > +
> > +#ifndef __riscv_strict_align
>
> Would this be defined by the compiler as a predefined macro, or is it just a debug
> switch? If the latter, I think it would be better to remove it.
>
> > + li a3, 8
> > + blt a2, a3, .Lbytewise
> > +
> > + li t1, 0x0101010101010101
> > + slli a4, t1, 7 /* a4 = 0x8080808080808080 */
> > + mul t2, a1, t1 /* entirety of t2 is now repeats of target character;
> > + assume mul is at worst no worse than 3*(shift+OR),
> > + otherwise do that instead */
> > +
> > +/*
> > + * strategy:
> > + * t4 = ((*a0) ^ t2)
> > + * - now t4 contains zero bytes if and only if next word of memory
> > + * had target character at those positions
> > + *
> > + * t4 = ((t4-0x0101010101010101) & ~t4) & 0x8080808080808080
> > + * - all nonzero bytes of t4 become 0; zero bytes become 0x80
> > + *
> > + * if t4 is nonzero, find the index of the byte within it, add to a0 and return
> > + * otherwise, loop
> > + */
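A scalar C model of the word-at-a-time check described above (illustrative only):

#include <stdint.h>

/* Nonzero iff some byte of 'word' equals 'target'.  */
static int
has_byte (uint64_t word, uint8_t target)
{
  uint64_t ones = 0x0101010101010101ULL;
  uint64_t x = word ^ (ones * target);          /* matching bytes become zero */
  return ((x - ones) & ~x & (ones << 7)) != 0;  /* 0x80 marks each zero byte */
}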
> > +
> > +1:
> > + ld t4, (a0) /* t4 = load next 8 bytes */
> > + xor t4, t4, t2
> > + sub t5, t4, t1
> > + not t4, t4
> > + and t4, t5, t4
> > + and t4, t4, a4
> > + bnez t4, .Lbytewise /* could use ctzw, mod+lookup or just binary chop
> > + to locate byte of interest in t4 but profiling
> > + shows these approaches are at best no better */
> > + addi a2, a2, -8
> > + addi a0, a0, 8
> > + bgeu a2, a3, 1b
> > + beqz a2, .Lnot_found
> > +#endif // __riscv_strict_align
> > +
> > +/* too little data for a dword. mask calculation and branch mispredict costs
> > + make checking a word not worthwhile. degrade to bytewise search. */
> > +
> > +.Lbytewise:
> > + add t2, a0, a2
> > +
> > +1:
> > + lb t1, (a0)
> > + beq t1, a1, .Lfound
> > + addi a0, a0, 1
> > + blt a0, t2, 1b
> > +
> > +.Lnot_found:
> > + mv a0, zero
> > +.Lfound:
> > + ret
> > +
> > +.Lvector_path:
> > + vsetvli t2, a2, e8, m2, ta, ma
> > +
> > +1:
> > + vle8.v v2, (a0)
> > + vmseq.vx v0, v2, a1
> > + vfirst.m t3, v0
> > + bgez t3, .Lvec_found
> > + add a0, a0, t2
> > + sub a2, a2, t2
> > + bge a2, t2, 1b
> > + bnez a2, 2f
> > + mv a0, zero
> > + ret
> > +
> > +2:
> > + vsetvli t2, a2, e8, m2, ta, ma
> > + vle8.v v2, (a0)
> > + vmseq.vx v0, v2, a1
> > + vfirst.m t3, v0
> > + bgez t3, .Lvec_found
> > + mv a0, zero
> > + ret
> > +
> > +.Lvec_found:
> > + add a0, a0, t3
> > + ret
> > +
> > +.size memchr, .-memchr
> > +libc_hidden_builtin_def (memchr)
> > \ No newline at end of file
>
> Please add a newline.
>
> > diff --git a/sysdeps/riscv/rv64/rvv/strcpy.S b/sysdeps/riscv/rv64/rvv/strcpy.S
> > new file mode 100644
> > index 0000000000..b21909d66f
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strcpy.S
> > @@ -0,0 +1,72 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
>
> You could add an optimized stpcpy and implement strcpy on top of it
> (as my generic proposal does [1]). ARMv6 does something similar [2].
>
> [1] https://patchwork.sourceware.org/project/glibc/patch/20230201170406.303978-12-adhemerval.zanella@linaro.org/
> [2] https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/arm/armv6/strcpy.S;h=e9f63a56c1c605a21b05f7ac21412585b0705171;hb=HEAD
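Something along these lines, presumably (the names below are illustrative, not glibc's actual internal symbols):

char *stpcpy_vectorised (char *dest, const char *src);   /* the RVV primitive */

char *
strcpy_via_stpcpy (char *dest, const char *src)
{
  stpcpy_vectorised (dest, src);
  return dest;
}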
>
> > +#include <sysdep.h>
> > +
> > +.globl strcpy
> > +.type strcpy,@function
> > +
> > +/*
> > + * optimized strcpy for riscv with vector extension
> > + * assumptions:
> > + * - vlenb is a power of 2
> > + * - page size >= 2*vlenb
> > + */
> > +
> > +.align 2
> > +strcpy:
> > + mv t0, a0 /* copy dest so we can return it */
> > +
> > + csrr t1, vlenb /* find vlenb*2 */
> > + add t1, t1, t1
> > +
> > + addi t2, t1, -1 /* mask unaligned part of ptr */
> > + and t2, a1, t2
> > + beqz t2, .Laligned
> > +
> > + sub t2, t1, t2 /* search enough to align ptr */
> > + vsetvli t2, t2, e8, m2, tu, mu
> > + vle8.v v2, (a1)
> > + vmseq.vx v4, v2, zero
> > + vmsif.m v0, v4 /* copy but not past null */
> > + vfirst.m t3, v4
> > + vse8.v v2, (t0), v0.t
> > + bgez t3, .Ldone
> > + add t0, t0, t2
> > + add a1, a1, t2
> > +
> > +.Laligned:
> > + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> > +
> > +1:
> > + vle8.v v2, (a1)
> > + add a1, a1, t1
> > + vmseq.vx v4, v2, zero
> > + vmsif.m v0, v4
> > + vfirst.m t3, v4
> > + vse8.v v2, (t0), v0.t
> > + add t0, t0, t1
> > + bltz t3, 1b
> > +
> > +.Ldone:
> > + ret
> > +
> > +.size strcpy, .-strcpy
> > +libc_hidden_builtin_def (strcpy)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strcspn.c b/sysdeps/riscv/rv64/rvv/strcspn.c
> > new file mode 100644
> > index 0000000000..f0595a72fb
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strcspn.c
> > @@ -0,0 +1,22 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
> > +/* strcspn is implemented in strspn.S
> > + */
> > diff --git a/sysdeps/riscv/rv64/rvv/strlen.S b/sysdeps/riscv/rv64/rvv/strlen.S
> > new file mode 100644
> > index 0000000000..c77d500693
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strlen.S
> > @@ -0,0 +1,67 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +.globl strlen
> > +.type strlen,@function
> > +
> > +/*
> > + * optimized strlen for riscv with vector extension
> > + * assumptions:
> > + * - vlenb is a power of 2
> > + * - page size >= 2*vlenb
> > + */
> > +
> > +.align 2
> > +strlen:
> > + mv t4, a0 /* copy of buffer start */
> > + csrr t1, vlenb /* find vlenb*2 */
> > + add t1, t1, t1
> > + addi t2, t1, -1 /* mask off unaligned part of ptr */
> > + and t2, a0, t2
> > + beqz t2, .Laligned
> > +
> > + sub t2, t1, t2 /* search fwd to align ptr */
> > + vsetvli t2, t2, e8, m2, ta, ma
> > + vle8.v v2, (a0)
> > + vmseq.vx v0, v2, zero
> > + vfirst.m t3, v0
> > + bgez t3, .Lfound
> > + add a0, a0, t2
> > +
> > +.Laligned:
> > + vsetvli zero, t1, e8, m2, ta, ma /* search 2*vlenb bytes per pass */
> > + add t4, t4, t1
> > +
> > +1:
> > + vle8.v v2, (a0)
> > + add a0, a0, t1
> > + vmseq.vx v0, v2, zero
> > + vfirst.m t3, v0
> > + bltz t3, 1b
> > +
> > +.Lfound: /* found the 0; subtract */
> > + sub a0, a0, t4 /* buffer start from current ptr */
> > + add a0, a0, t3 /* and add offset into fetched */
> > + ret /* data to get length */
> > +
> > +.size strlen, .-strlen
> > +libc_hidden_builtin_def (strlen)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strncmp.S b/sysdeps/riscv/rv64/rvv/strncmp.S
> > new file mode 100644
> > index 0000000000..863e5cb525
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strncmp.S
> > @@ -0,0 +1,104 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +.globl strncmp
> > +.type strncmp,@function
> > +
> > +.align 2
> > +
> > +/* as strcmp, but with added checks on a2 (max count)
> > + */
> > +
> > +strncmp:
> > + csrr t1, vlenb /* find vlenb*2 */
> > + add t1, t1, t1
> > + blt a2, t1, .Ltail /* degrade if max < vlenb*2 */
> > + vsetvli zero, t1, e8, m2, ta, mu
> > + vid.v v30
> > + addi t2, t1, -1 /* mask unaligned part of ptr */
> > + and t6, a0, t2 /* unaligned part of lhs */
> > + and t5, a1, t2 /* unaligned part of rhs */
> > + sub t6, t1, t6 /* safe count to read from lhs */
> > + sub t5, t1, t5 /* same, rhs */
> > + vmsltu.vx v28, v30, t6 /* mask for first part of lhs */
> > + vmsltu.vx v26, v30, t5 /* mask for first part of rhs */
> > + vmv.v.x v16, zero
> > + vmv.v.x v18, zero
> > +
> > +
> > +1: blt a2, t1, .Ltail
> > + vmv.v.v v0, v28 /* lhs mask */
> > + vle8.v v2, (a0), v0.t /* masked load from lhs */
> > + vmseq.vx v16, v2, zero, v0.t /* check loaded bytes for null */
> > + vmv.v.v v0, v26 /* rhs mask */
> > + vfirst.m t2, v16 /* get lhs check result */
> > + bgez t2, .Ltail /* can we safely check rest */
> > + vle8.v v4, (a1), v0.t /* masked load from rhs */
> > + vmseq.vx v18, v4, zero, v0.t /* check partial rhs */
> > + vmnot.m v0, v28 /* mask for rest of lhs */
> > + vfirst.m t3, v18 /* get check result */
> > + bltz t3, 2f /* test it */
> > + bge t3, t6, .Ltail
> > +
> > + vmsleu.vx v0, v30, t3 /* select rest of string + null */
> > + vmsne.vv v0, v2, v4, v0.t /* compare */
> > + vfirst.m t3, v0
> > + bgez t3, 3f
> > + mv a0, zero
> > + ret
> > +3: add a0, a0, t3
> > + add a1, a1, t3
> > + lbu t0, (a0)
> > + lbu t1, (a1)
> > +.Ldiff:
> > + sub a0, t0, t1
> > + ret
> > +
> > + /* ...no null terminator in first part of lhs or rhs */
> > +2: vle8.v v2, (a0), v0.t /* load rest of lhs */
> > + vmnot.m v0, v26 /* mask for rest of rhs */
> > + vle8.v v4, (a1), v0.t /* load rest of rhs */
> > + vmsne.vv v0, v2, v4 /* compare */
> > + add a0, a0, t1 /* advance ptrs */
> > + vfirst.m t3, v0
> > + add a1, a1, t1
> > + sub a2, a2, t1
> > + bltz t3, 1b
> > +
> > + sub t3, t3, t1 /* found a diff but we've already advanced a0 and a1 */
> > + j 3b
> > +
> > +.Ltail:
> > + beqz a2, 1f
> > + addi a2, a2, -1
> > + lbu t0, (a0)
> > + lbu t1, (a1)
> > + bne t0, t1, .Ldiff
> > + addi a0, a0, 1
> > + addi a1, a1, 1
> > + bnez t0, .Ltail
> > +1: mv a0, zero
> > + ret
> > +
> > +
> > +.size strncmp, .-strncmp
> > +libc_hidden_builtin_def (strncmp)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strncpy.S b/sysdeps/riscv/rv64/rvv/strncpy.S
> > new file mode 100644
> > index 0000000000..8b3a1e545c
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strncpy.S
> > @@ -0,0 +1,96 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +.globl strncpy
> > +.type strncpy,@function
> > +
> > +/*
> > + * optimized strncpy for riscv with vector extension
> > + * assumptions:
> > + * - vlenb is a power of 2
> > + * - page size >= 2*vlenb
> > + */
> > +
> > +.align 2
> > +strncpy:
> > + mv t0, a0 /* need to return dest so copy */
> > +
> > + csrr t1, vlenb /* find vlenb*2 */
> > + add t1, t1, t1
> > +
> > + addi t2, t1, -1 /* mask off unaligned part of ptr */
> > + and t2, a1, t2
> > + beqz t2, .Laligned
> > +
> > + sub t2, t1, t2 /* search to align the pointer */
> > + vsetvli zero, t2, e8, m2, tu, mu
> > + vle8.v v2, (a1)
> > + vmseq.vx v4, v2, zero
> > + vmsif.m v0, v4 /* copy to dest */
> > + vfirst.m t3, v4
> > + bgeu t2, a2, .Ldest_full
> > + vse8.v v2, (t0), v0.t
> > + bgez t3, .Lterminator_found
> > + add t0, t0, t2
> > + add a1, a1, t2
> > + sub a2, a2, t2
> > + beqz a2, .Ldone
> > +
> > +.Laligned:
> > + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> > +
> > +1:
> > + vle8.v v2, (a1)
> > + add a1, a1, t1
> > + vmseq.vx v4, v2, zero
> > + vmsif.m v0, v4
> > + vfirst.m t3, v4
> > + bgeu t1, a2, .Ldest_full
> > + vse8.v v2, (t0), v0.t
> > + add t0, t0, t1
> > + sub a2, a2, t1
> > + bltz t3, 1b
> > + sub t0, t0, t1
> > +
> > +.Lterminator_found:
> > + addi sp, sp, -16
> > + sd ra, 0(sp)
> > + sd a0, 8(sp)
> > + add a0, t0, t3
> > + mv a1, zero
> > + sub a2, a2, t3
> > + jal ra, memset
> > + ld ra, 0(sp)
> > + ld a0, 8(sp)
> > + addi sp, sp, 16
> > +.Ldone:
> > + ret
> > +
> > +.Ldest_full:
> > + vid.v v6
> > + vmsltu.vx v4, v6, a2
> > + vmand.mm v0, v0, v4
> > + vse8.v v2, (t0), v0.t
> > + ret
> > +
> > +.size strncpy, .-strncpy
> > +libc_hidden_builtin_def (strncpy)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strnlen.S b/sysdeps/riscv/rv64/rvv/strnlen.S
> > new file mode 100644
> > index 0000000000..6d7ee65c7a
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strnlen.S
> > @@ -0,0 +1,81 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
> > +#include <sysdep.h>
> > +
>
> Maybe use a generic implementation that issues memchr (which should be optimized
> using vector instructions) [3]? It would be an extra function call, but it should really
> help with both code size and icache pressure.
>
> [3] https://patchwork.sourceware.org/project/glibc/patch/20230201170406.303978-6-adhemerval.zanella@linaro.org/
>
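For illustration, a minimal sketch of the generic approach being suggested here (illustrative only; the linked patch is the authoritative generic version): strnlen reduces to a single memchr call, so the vectorised memchr does all the scanning.

    #include <string.h>

    /* Sketch only: defer all scanning to memchr, which is expected to be
       vectorised on this target.  */
    size_t
    __strnlen (const char *s, size_t maxlen)
    {
      const char *p = memchr (s, '\0', maxlen);
      return p == NULL ? maxlen : (size_t) (p - s);
    }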
> > +.globl __strnlen
> > +.type __strnlen,@function
> > +
> > +/* vector optimized strnlen
> > + * assume it's safe to read to the end of the page
> > + * containing either a null terminator or the last byte of the count or both,
> > + * but not past it
> > + * assume page size >= vlenb*2
> > + */
> > +
> > +.align 2
> > +__strnlen:
> > + mv t4, a0 /* stash a copy of start for later */
> > + beqz a1, .LzeroCount
> > +
> > + csrr t1, vlenb /* find vlenb*2 */
> > + add t1, t1, t1
> > + addi t2, t1, -1 /* mask off unaligned part of ptr */
> > +    and t2, a0, t2
> > + beqz t2, .Laligned
> > +
> > + sub t2, t1, t2 /* search to align pointer to t1 */
> > + bgeu t2, a1, 2f /* check it's safe */
> > + mv t2, a1 /* it's not! look as far as permitted */
> > +2: vsetvli t2, t2, e8, m2, ta, ma
> > + vle8.v v2, (a0)
> > + vmseq.vx v0, v2, zero
> > + vfirst.m t3, v0
> > + bgez t3, .Lfound
> > + add a0, a0, t2
> > + sub a1, a1, t2
> > + bltu a1, t1, .LreachedCount
> > +
> > +.Laligned:
> > + vsetvli zero, t1, e8, m2, ta, ma /* do 2*vlenb bytes per pass */
> > +
> > +1: vle8.v v2, (a0)
> > + sub a1, a1, t1
> > + vmseq.vx v0, v2, zero
> > + vfirst.m t3, v0
> > + bgez t3, .Lfound
> > + add a0, a0, t1
> > + bgeu a1, t1, 1b
> > +.LreachedCount:
> > + mv t2, a1 /* in case 0 < a1 < t1 */
> > + bnez a1, 2b /* if so, still t2 bytes to check, all safe */
> > +.LzeroCount:
> > + sub a0, a0, t4
> > + ret
> > +
> > +.Lfound: /* found the 0; subtract buffer start from current pointer */
> > + add a0, a0, t3 /* and add offset into fetched data */
> > + sub a0, a0, t4
> > + ret
> > +
> > +.size __strnlen, .-__strnlen
> > +weak_alias (__strnlen, strnlen)
> > +libc_hidden_builtin_def (__strnlen)
> > +libc_hidden_builtin_def (strnlen)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strrchr.S b/sysdeps/riscv/rv64/rvv/strrchr.S
> > new file mode 100644
> > index 0000000000..4bef8a3b9c
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strrchr.S
> > @@ -0,0 +1,88 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
> > +#include <sysdep.h>
>
> Is it really worth adding a strrchr optimization? The generic implementation
> already calls strchr (which should be optimized).
>
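For reference, a rough sketch of how a generic strrchr can sit on top of strchr, which is the point being made here (illustrative only, not the actual glibc generic code):

    #include <string.h>

    /* Sketch: repeatedly call strchr and remember the last hit.  */
    char *
    strrchr (const char *s, int c)
    {
      const char *found = NULL;

      if (c == '\0')
        return strchr (s, '\0');

      while ((s = strchr (s, c)) != NULL)
        {
          found = s;
          s++;
        }

      return (char *) found;
    }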
> > +
> > +.globl strrchr
> > +.type strrchr,@function
> > +
> > +/*
> > + * optimized strrchr for riscv with vector extension
> > + * assumptions:
> > + * - vlenb is a power of 2
> > + * - page size >= 2*vlenb
> > + */
> > +
> > +.align 2
> > +strrchr:
> > + mv t5, a0 /* stash buffer ptr somewhere safe */
> > + mv a0, zero /* result is nullptr unless we find better below */
> > +
> > + csrr t1, vlenb /* determine vlenb*2 */
> > + add t1, t1, t1
> > + addi t2, t1, -1 /* mask off unaligned part of ptr */
> > + and t2, t5, t2
> > + beqz t2, .Laligned
> > +
> > + sub t2, t1, t2 /* search to align ptr to 2*vlenb */
> > + vsetvli t2, t2, e8, m2, ta, mu
> > +
> > + vle8.v v2, (t5) /* load data into v2(,v3) */
> > + vmseq.vx v4, v2, zero /* check for null terminator */
> > + vfirst.m t4, v4 /* grab its position, if any */
> > + vmsbf.m v0, v4 /* select valid chars */
> > + vmseq.vx v0, v2, a1, v0.t /* search for candidate byte */
> > + vfirst.m t3, v0 /* grab its position, if any */
> > + bltz t3, 2f /* did we find a candidate? */
> > +
> > +3: add a0, t3, t5 /* we did! grab the address */
> > + vmsof.m v1, v0 /* there might be more than one */
> > + vmandn.mm v0, v0, v1 /* so clear the one we just found */
> > + vfirst.m t3, v0 /* is there another? */
> > + bgez t3, 3b
> > +
> > +2: bgez t4, .Ldone /* did we see a null terminator? */
> > + add t5, t5, t2
> > +
> > +.Laligned:
> > + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> > +
> > +1: vle8.v v2, (t5)
> > + vmseq.vx v4, v2, zero
> > + vfirst.m t4, v4
> > + vmsbf.m v0, v4
> > + vmseq.vx v0, v2, a1, v0.t
> > + vfirst.m t3, v0
> > + bltz t3, 2f
> > +
> > +3: add a0, t3, t5
> > + vmsof.m v1, v0
> > + vmandn.mm v0, v0, v1
> > + vfirst.m t3, v0
> > + bgez t3, 3b
> > +
> > +2: add t5, t5, t1
> > + bltz t4, 1b
> > +
> > +.Ldone:
> > + ret
> > +
> > +.size strrchr, .-strrchr
> > +libc_hidden_builtin_def (strrchr)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strspn.S b/sysdeps/riscv/rv64/rvv/strspn.S
> > new file mode 100644
> > index 0000000000..2b9af5cc2d
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strspn.S
> > @@ -0,0 +1,189 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +.globl strspn
> > +.type strspn,@function
> > +
> > +.globl strcspn
> > +.type strcspn,@function
> > +
> > +/*
> > + * optimized strspn / strcspn for riscv with vector extension
> > + * assumptions:
> > + * - vlenb is a power of 2
> > + * - page size >= 32
> > + * strategy:
> > + * - build a 256-bit table on the stack, where each elt is zero
> > + * if encountering it should terminate computation and nonzero otherwise
> > + * - use vectorised lookups into this to check 2*vlen elts at a time;
> > + * this code is identical for strspn and strcspn and can be shared
> > + *
> > + * note that while V mandates at least 128 bit wide regs,
> > + * we are building a 256 bit lookup table
> > + * therefore we use either LMUL=1 or 2 depending on what the target supports
> > + * therefore we only use even vector register numbers,
> > + * so everything still works if we go with LMUL=2
> > + */
> > +
>
> I wonder if we could adapt the generic implementation, so riscv only reimplements
> the vectorized search instead of all the boilerplate to generate the table and
> early tests.
+1
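To make the suggested split concrete, a sketch of how the routine could be factored (illustrative only; names and layout are not taken from glibc): the table build and the empty-accept check stay generic, and only the final scan loop is what a riscv version would replace with the vectorised lookup.

    #include <stddef.h>

    size_t
    strspn (const char *s, const char *accept)
    {
      /* Generic part: build a 256-entry byte table; entry 0 (the null
         terminator) stays zero so the scan below stops at end of string.  */
      unsigned char table[256] = { 0 };
      for (const unsigned char *a = (const unsigned char *) accept; *a != '\0'; a++)
        table[*a] = 1;

      /* Arch-specific part: this scan is what the vectorised code would
         replace, checking 2*vlenb bytes per iteration instead of one.  */
      const unsigned char *p = (const unsigned char *) s;
      while (table[*p] != 0)
        p++;

      return (size_t) (p - (const unsigned char *) s);
    }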
>
> > +# -----------------------------
> > +
> > +.align 2
> > +
> > +strspn:
> > + lbu t0, 0(a1)
> > + bnez t0, .Lbuild_table
> > + mv a0, zero
> > + ret
> > +
> > +.Lbuild_table:
> > + mv a6, a0 /* store incoming a0 */
> > + li t1, 32 /* want to deal with 256 bits at a time, so 32 bytes */
> > +
> > + vsetvli zero, t1, e8, m1, tu, mu
> > +#if __riscv_v_min_vlen < 256
> > +    /* we want to build a 256-bit table, so use vlenb*2 bytes with m2
> > +     * if regs are 128 bits wide, or vlenb bytes with m1 if they are >= 256 bits;
> > + * 'V' extension specifies a minimum vlen of 128 so this should cover
> > + * all cases; we can skip the check if we know vlen >= 256 at compile time
> > + */
> > + csrr t2, vlenb
> > + bgeu t2, t1, 1f
> > + vsetvli zero, t1, e8, m2, tu, mu
> > +1:
> > +#endif // __riscv_v_min_vlen
> > +
> > + /* read one char from the charset at a time and write the correct bit
> > + * in the lookup table; we could do SIMD iff we ever get an extension
> > + * that provides some way of scattering bytes into a reg group
> > + */
> > + vmv.v.x v16, zero /* clear out table */
> > + vmv.v.x v8, zero /* clear out v8 */
> > + li t3, 1
> > + vmv.s.x v8, t3 /* v8 now all zeroes except bottom byte */
> > +
> > +1: vmv.v.x v2, zero /* clear out v2 */
> > + addi a1, a1, 1 /* advance charset ptr */
> > + srli t2, t0, 3 /* divide the byte we read earlier by 8 */
> > + vslideup.vx v2, v8, t2 /* v2 now 1 in the correct byte 0 elsewhere */
> > + vsll.vx v2, v2, t0 /* v2 now 1 in the correct bit, 0 elsewhere */
> > + vor.vv v16, v16, v2 /* or it in */
> > +    lbu t0, 0(a1)           /* fetch next byte */
> > +    bnez t0, 1b             /* if it's not null, go round again */
> > +
> > +/*
> > + * Table is now built in v16.
> > + * Strategy:
> > + * - fetch next t1 bytes from memory
> > + * - vrgather on their values divided by 8 to get relevant bytes of table
> > + * - shift right to get the correct bit into bit 0
> > + * - and with 1, compare with expected terminator value, then check mask
> > + * to see if we've found a terminator
> > + *
> > + * Before we can begin, a0 needs to be t1-aligned, so that when we fetch
> > + * the next t1 bytes - any of which may be the null terminator -
> > + * we do not cross a page boundary and read unmapped memory. Therefore
> > + * we have one read of however many bytes are needed to align a0,
> > + * before the main loop.
> > + */
> > +
> > +.Lscan_table:
> > + vmv.v.x v8, t3 /* v8 now t1 bytes of 0x01 */
> > +
> > +    andi t2, a0, 31         /* misalignment of a0 (t1 == 32) */
> > + beqz t2, 2f /* or skip if we're already aligned */
> > + sub t2, t1, t2 /* t2 now bytes to read to align to t1 */
> > +
> > + vid.v v2 /* build mask instead of changing vl */
> > + vmsltu.vx v0, v2, t2 /* so we don't need to track LMUL */
> > +
> > + vle8.v v2, (a0), v0.t /* load next bytes from input */
> > + vsrl.vi v4, v2, 3 /* divide by 8 */
> > + vrgather.vv v6, v16, v4 /* corresponding bytes of bit table */
> > + vsrl.vv v6, v6, v2 /* shift correct bits to lsb */
> > + vand.vv v6, v6, v8 /* and with 1 to complete the lookups */
> > + vmseq.vx v4, v6, zero, v0.t /* check to see if any 0s are present */
> > + vfirst.m t0, v4 /* index of the first 0, if any */
> > + bgez t0, .Lscan_end /* if we found one, stop */
> > + add a0, a0, t2 /* advance by number of bytes we read */
> > +
> > +2: add a6, a6, t1 /* we'll advance a0 before the exit check */
> > +1: vle8.v v2, (a0) /* as above but unmasked so t1 elts per pass */
> > + add a0, a0, t1
> > +
> > + vsrl.vi v4, v2, 3
> > + vrgather.vv v6, v16, v4
> > + vsrl.vv v6, v6, v2
> > + vand.vv v6, v6, v8
> > +
> > + vmseq.vx v4, v6, zero
> > + vfirst.m t0, v4
> > + bltz t0, 1b
> > +
> > +.Lscan_end:
> > + add a0, a0, t0 /* calculate offset to terminating byte */
> > + sub a0, a0, a6
> > + ret
> > +.size strspn, .-strspn
> > +
> > +/* strcspn
> > + *
> > + * table build exactly as for strspn, except:
> > + * - the lookup table starts with all bits except bit 0 of byte 0 set
> > + * - we clear the corresponding bit for each byte in the charset
> > + * once table is built, we can reuse the scan code directly
> > + */
> > +
> > +strcspn:
> > + lbu t0, 0(a1)
> > + beqz t0, strlen /* no rejections -> prefix is whole string */
> > +
> > + mv a6, a0
> > + li t1, 32
> > +
> > + vsetvli zero, t1, e8, m1, tu, mu
> > +#if __riscv_v_min_vlen < 256
> > + csrr t2, vlenb
> > + bgeu t2, t1, 1f
> > + vsetvli zero, t1, e8, m2, tu, mu
> > +1:
> > +#endif // __riscv_v_min_vlen
> > +
> > + vmv.v.x v8, zero
> > + li t3, 1 /* all bits clear except bit 0 of byte 0 */
> > + vmv.s.x v8, t3
> > + vnot.v v16, v8 /* v16 is the inverse of that */
> > + li t4, -1
> > +
> > +1: vmv.v.x v2, zero
> > + addi a1, a1, 1 /* advance charset ptr */
> > + srli t2, t0, 3 /* select correct bit in v2 */
> > + vslideup.vx v2, v8, t2
> > + vsll.vx v2, v2, t0
> > + vnot.v v2, v2 /* invert */
> > + vand.vv v16, v16, v2 /* clear the relevant bit of table */
> > + lbu t0, 0(a1)
> > + bnez t0, 1b
> > + j .Lscan_table
> > +.size strcspn, .-strcspn
> > +
> > +libc_hidden_builtin_def (strspn)
> > +libc_hidden_builtin_def (strcspn)
> > \ No newline at end of file
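As a scalar reference for the lookup that strspn.S performs with vrgather/vsrl/vand above: byte c is accepted when bit (c & 7) of table byte (c >> 3) is set; the vector code can shift by c directly because shifts on 8-bit elements only use the low three bits of the shift amount. A sketch (illustration only):

    /* Membership test against the 256-bit table used by strspn/strcspn.  */
    static inline int
    in_set (const unsigned char table[32], unsigned char c)
    {
      return (table[c >> 3] >> (c & 7)) & 1;
    }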
On Wed, Feb 1, 2023 at 1:54 AM Sergei Lewis <slewis@rivosinc.com> wrote:
>
> Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
> strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
> targeting the riscv "V" extension, version 1.0
>
> The vectorised implementations assume VLENB of at least 128 and at least 32
> registers (as mandated by the "V" extension spec). They also assume that
> VLENB is a power of two which is no larger than the page size
Standard software shouldn't be arbitrarily imposing upper bounds on
VLEN. (The assumption that VLEN >= 128 is valid, because that's
mandated by the V extension specification.)
There are already RISC-V vector supercomputer proposals that rub up
against this limit, or maybe even exceed it. glibc shouldn't be the
place where we decide such implementations are nonconforming
(especially if the outcome is to unexpectedly fail at runtime).
The intended mechanism to vectorize string routines is the
fault-only-first loads. These also result in much simpler code. See
the sample code for strlen, for example, which doesn't need to make
assumptions about VLEN or about the page size. There are other string
examples in the same directory.
https://github.com/riscv/riscv-v-spec/blob/6673ce8b1df3126cf250b8cbf422329f257adf08/example/strlen.s
Please do not merge this patch set as-is.
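For comparison, a strlen in the spirit of the linked example, using the fault-only-first load so that no page-size or maximum-VLEN assumption is needed: a fault on any element other than the first simply shrinks vl instead of trapping. This is an illustrative sketch, not part of the patch.

        .globl strlen_ff
        .type strlen_ff, @function
        /* size_t strlen_ff (const char *str) -- illustrative only */
    strlen_ff:
        mv       t0, a0                    /* remember the start of the string */
    1:  vsetvli  t1, zero, e8, m8, ta, ma  /* as many bytes per pass as the hw allows */
        vle8ff.v v8, (a0)                  /* fault-only-first load: traps only if
                                              element 0 faults, else truncates vl */
        csrr     t1, vl                    /* bytes actually loaded this pass */
        vmseq.vi v0, v8, 0                 /* look for the null terminator */
        vfirst.m t2, v0                    /* index of first zero byte, or -1 */
        add      a0, a0, t1                /* advance past the bytes consumed */
        bltz     t2, 1b                    /* no terminator yet: next chunk */
        sub      a0, a0, t1                /* back to the start of the last chunk... */
        add      a0, a0, t2                /* ...then forward to the terminator */
        sub      a0, a0, t0                /* length = terminator - start */
        ret
        .size strlen_ff, .-strlen_ff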
> , and (as
> vectorised code in glibc for other platforms does) that it is safe to read
> past null terminators / buffer ends provided one does not cross a page
> boundary.
>
> Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
> ---
> diff --git a/sysdeps/riscv/rv64/rvv/Implies b/sysdeps/riscv/rv64/rvv/Implies
> new file mode 100644
> index 0000000000..b07b4cb906
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/Implies
> @@ -0,0 +1,2 @@
> +riscv/rv64/rvd
> +
> diff --git a/sysdeps/riscv/rv64/rvv/memchr.S b/sysdeps/riscv/rv64/rvv/memchr.S
> new file mode 100644
> index 0000000000..a7e32b8f25
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memchr.S
> @@ -0,0 +1,127 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +
> +/* Optimised memchr for riscv with vector extension
> + * Assumptions:
> + * - cpu becomes bandwidth limited at or before
> + * 2 vector register sized read/write operations
> + * + 2 scalar operations
> + * + conditional branch
> + */
> +
> +.globl memchr
> +.type memchr,@function
> +
> +.align 2
> +memchr:
> + beqz a2, .Lnot_found
> + csrr t1, vlenb
> + bgeu a2, t1, .Lvector_path /* only use vector path if we're scanning
> + at least vlenb bytes */
> +
> +#ifndef __riscv_strict_align
> + li a3, 8
> + blt a2, a3, .Lbytewise
> +
> + li t1, 0x0101010101010101
> + slli a4, t1, 7 /* a4 = 0x8080808080808080 */
> + mul t2, a1, t1 /* entirety of t2 is now repeats of target character;
> + assume mul is at worst no worse than 3*(shift+OR),
> +                      assume mul costs no more than 3*(shift+OR),
> +
> +/*
> + * strategy:
> + * t4 = ((*a0) ^ t2)
> + * - now t4 contains zero bytes if and only if next word of memory
> + * had target character at those positions
> + *
> + * t4 = ((t4-0x0101010101010101) & ~t4) & 0x8080808080808080
> + * - all nonzero bytes of t4 become 0; zero bytes become 0x80
> + *
> + * if t4 is nonzero, find the index of the byte within it, add to a0 and return
> + * otherwise, loop
> + */
> +
> +1:
> + ld t4, (a0) /* t4 = load next 8 bytes */
> + xor t4, t4, t2
> + sub t5, t4, t1
> + not t4, t4
> + and t4, t5, t4
> + and t4, t4, a4
> + bnez t4, .Lbytewise /* could use ctzw, mod+lookup or just binary chop
> + to locate byte of interest in t4 but profiling
> + shows these approaches are at best no better */
> + addi a2, a2, -8
> + addi a0, a0, 8
> + bgeu a2, a3, 1b
> + beqz a2, .Lnot_found
> +#endif // __riscv_strict_align
> +
> +/* too little data for a dword. mask calculation and branch mispredict costs
> + make checking a word not worthwhile. degrade to bytewise search. */
> +
> +.Lbytewise:
> + add t2, a0, a2
> +
> +1:
> + lb t1, (a0)
> + beq t1, a1, .Lfound
> + addi a0, a0, 1
> + blt a0, t2, 1b
> +
> +.Lnot_found:
> + mv a0, zero
> +.Lfound:
> + ret
> +
> +.Lvector_path:
> + vsetvli t2, a2, e8, m2, ta, ma
> +
> +1:
> + vle8.v v2, (a0)
> + vmseq.vx v0, v2, a1
> + vfirst.m t3, v0
> + bgez t3, .Lvec_found
> + add a0, a0, t2
> + sub a2, a2, t2
> + bge a2, t2, 1b
> + bnez a2, 2f
> + mv a0, zero
> + ret
> +
> +2:
> + vsetvli t2, a2, e8, m2, ta, ma
> + vle8.v v2, (a0)
> + vmseq.vx v0, v2, a1
> + vfirst.m t3, v0
> + bgez t3, .Lvec_found
> + mv a0, zero
> + ret
> +
> +.Lvec_found:
> + add a0, a0, t3
> + ret
> +
> +.size memchr, .-memchr
> +libc_hidden_builtin_def (memchr)
> \ No newline at end of file
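The word-at-a-time scalar path above relies on the classic zero-byte trick; the same logic in C, for reference (illustrative sketch, assumes 64-bit words):

    #include <stdint.h>

    /* Nonzero iff the 64-bit word w contains the byte c somewhere.  */
    static inline int
    word_has_byte (uint64_t w, unsigned char c)
    {
      uint64_t x = w ^ (0x0101010101010101ULL * c);  /* zero bytes where w == c */
      return ((x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL) != 0;
    }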
> diff --git a/sysdeps/riscv/rv64/rvv/memcmp.S b/sysdeps/riscv/rv64/rvv/memcmp.S
> new file mode 100644
> index 0000000000..a945753a5f
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memcmp.S
> @@ -0,0 +1,93 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +/* Optimised memcmp for riscv with vector extension
> + */
> +
> +.globl memcmp
> +.type memcmp,@function
> +
> +.align 2
> +
> +memcmp:
> + mv t2, zero
> + beqz a2, .Ldone
> +
> + li t1, 5 /* scalar path cheaper for 1-4 elts */
> + bltu a2, t1, .Lscalar
> +
> + /* main loop, vlenb*2 elts at a time */
> + vsetvli t1, a2, e8, m2, ta, ma
> +
> +1:
> + vle8.v v2, (a0) /* load elts */
> + vle8.v v4, (a1)
> + vmsne.vv v0, v2, v4 /* compare */
> + vfirst.m t3, v0
> + bgez t3, .Lvec_diff /* found a difference ? */
> + add a0, a0, t1 /* not yet, advance everything */
> + add a1, a1, t1
> + sub a2, a2, t1
> + bgeu a2, t1, 1b
> +
> + bnez a2, .Ltail
> + mv a0, zero
> + ret
> +
> +.Ltail:
> + /* handle tail. we know a2 < vlenb*2 so just load and compare the lot */
> + vsetvli t1, a2, e8, m2, ta, ma
> + vle8.v v2, (a0)
> + vle8.v v4, (a1)
> + vmsne.vv v0, v2, v4
> + vfirst.m t3, v0
> + bgez t3, .Lvec_diff
> + mv a0, zero /* no diff found */
> + ret
> +
> +.Lvec_diff: /* v2, v4 differ at elt t3 */
> + add a0, a0, t3
> + add a1, a1, t3
> + lbu t0, (a0)
> + lbu t1, (a1)
> + sub a0, t0, t1
> + ret
> +
> +.Lscalar:
> + add t3, a0, a2
> +
> +1:
> + lbu t0, (a0)
> + lbu t1, (a1)
> + sub t2, t0, t1
> + bnez t2, .Ldone
> + addi a0, a0, 1
> + addi a1, a1, 1
> + bltu a0, t3, 1b
> +
> +.Ldone:
> + mv a0, t2
> + ret
> +
> +
> +.size memcmp, .-memcmp
> +libc_hidden_builtin_def (memcmp)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/memcpy.S b/sysdeps/riscv/rv64/rvv/memcpy.S
> new file mode 100644
> index 0000000000..7b37ec285d
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memcpy.S
> @@ -0,0 +1,154 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +/* Optimised memcpy and memmove for riscv with vector extension
> + */
> +
> +.globl memcpy
> +.type memcpy,@function
> +.globl memmove
> +.type memmove,@function
> +
> +.align 2
> +memmove:
> + bge a0, a1, .Lmemcpy_rev
> +
> +memcpy:
> +.Lmemcpy_fwd:
> + mv t0, a0 /* t0 = preserve a0 so we can return it */
> + csrr t2, vlenb /* t2 = number of bytes per vectorised copy op */
> + slli t5, t2, 1 /* t5 = number of bytes per loop */
> + addi t3, t5, -1 /* generate mask */
> + not t4, t3
> + and t4, a2, t4 /* t4 = bytes copied in vectorised pass */
> +
> + beqz t4, .Lscalar_fwd /* size too small for even one pass? */
> +
> + and a2, a2, t3 /* a2 = bytes still left to copy after pass */
> + add t4, t4, a1 /* t4 = src at end of vectorised pass */
> +
> +1:
> + vl2r.v v2, (a1) /* load, advance source */
> + add a1, a1, t5
> + vs2r.v v2, (t0) /* store, advance dest */
> + add t0, t0, t5
> + bltu a1, t4, 1b /* src at end? */
> +
> + bltu a2, t2, .Lscalar_fwd /* should we do one more vec load/store? */
> + vl1r.v v2, (a1)
> + sub a2, a2, t2
> + add a1, a1, t2
> + vs1r.v v2, (t0)
> + add t0, t0, t2
> +
> +.Lscalar_fwd:
> + bnez a2, .Lnobail
> +.Lbail:
> + ret
> +.Lnobail:
> +
> +#ifndef __riscv_strict_align
> + addi t2, zero, 4
> + bltu a2, t2, .Lsingle_bytes
> +1:
> + lw t3, 0(a1)
> + addi a1, a1, 4
> + sw t3, 0(t0)
> + addi t0, t0, 4
> + addi a2, a2, -4
> + bgeu a2, t2, 1b
> +#endif // __riscv_strict_align
> +
> +.Lsingle_bytes:
> + beqz a2, .Lbail
> + add a2, a2, a1 /* a2 = src + remaining size */
> +1:
> + lb t1, 0(a1)
> + sb t1, 0(t0)
> + addi a1, a1, 1
> + addi t0, t0, 1
> + bltu a1, a2, 1b
> + ret
> +.size memcpy, .-memcpy
> +
> +
> +.Lmemcpy_rev:
> + beq a0, a1, .Lmemcpy_rev_done
> + add t0, a0, a2 /* t0 = dest so we can return a0=dest later */
> + add t6, a1, a2 /* dest and src both point to byte */
> + /* immediately after end of buffer */
> +
> + csrr t2, vlenb /* t2 = number of bytes per pass */
> + slli t5, t2, 1 /* t5 = number of bytes per entire loop */
> + addi t3, t5, -1 /* t3 = (bytes per loop) mask */
> + not t4, t3 /* generate mask for bytes processed by loop */
> + and t4, a2, t4 /* t4 = bytes copied in vectorised pass */
> +
> + beqz t4, .Lscalar_rev /* size too small for even one pass? */
> +
> + and a2, a2, t3 /* a2 = bytes still left to copy after pass */
> + sub t4, t6, t4 /* t4 = src at end of vectorised pass */
> +
> +1:
> + sub t6, t6, t5
> + sub t0, t0, t5
> + vl2r.v v2, (t6) /* load, advance source */
> + vs2r.v v2, (t0) /* store, advance dest */
> + bgtu t6, t4, 1b /* src at end? */
> +
> + bltu a2, t2, .Lscalar_rev /* should we do one more vec load/store? */
> + sub t6, t6, t2
> + sub t0, t0, t2
> + sub a2, a2, t2
> + vl1r.v v2, (t6)
> + vs1r.v v2, (t0)
> +
> +.Lscalar_rev:
> +#ifndef __riscv_strict_align
> + beqz a2, .Lbail
> +
> + addi t2, zero, 4
> + bltu a2, t2, 2f
> +1:
> + addi t6, t6, -4
> + addi t0, t0, -4
> + addi a2, a2, -4
> + lw t3, 0(t6)
> + sw t3, 0(t0)
> + bgeu a2, t2, 1b
> +2:
> +#endif // __riscv_strict_align
> +
> + beqz a2, .Lbail
> +1:
> + addi t6, t6, -1
> + addi t0, t0, -1
> + lb t1, 0(t6)
> + sb t1, 0(t0)
> + bgtu t0, a0, 1b
> +
> +.Lmemcpy_rev_done:
> + ret
> +
> +.size memmove, .-memmove
> +libc_hidden_builtin_def (memcpy)
> +libc_hidden_builtin_def (memmove)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/memmove.c b/sysdeps/riscv/rv64/rvv/memmove.c
> new file mode 100644
> index 0000000000..47734854f9
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memmove.c
> @@ -0,0 +1,22 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +/* memmove is implemented in memcpy.S
> + */
> diff --git a/sysdeps/riscv/rv64/rvv/memset.S b/sysdeps/riscv/rv64/rvv/memset.S
> new file mode 100644
> index 0000000000..6f82c542b1
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memset.S
> @@ -0,0 +1,89 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +
> +/* Optimised memset for riscv with vector extension
> + */
> +
> +.globl memset
> +.type memset,@function
> +
> +.align 2
> +memset:
> + mv t0, a0 /* t0 = dest so we can return a0 later */
> + vsetvli t2, a2, e8, m2, ta, ma /* t2 = elts per copy */
> + beqz t2, .Lscalar
> +
> + vmv.v.x v2, a1 /* splat value across v2 */
> +
> + slli t3, t2, 1
> + bgtu t3, a2, .Lsinglestore
> +
> +1:
> + vse8.v v2, (t0)
> + add t0, t0, t2
> + vse8.v v2, (t0)
> + add t0, t0, t2
> + sub a2, a2, t3
> + bgeu a2, t3, 1b
> + bgeu a2, t2, .Lsinglestore
> + bnez a2, .Lscalar
> +
> +.Lbail:
> + ret
> +
> +.Lsinglestore:
> + bgtu t2, a2, .Lscalar
> + vse8.v v2, (t0)
> + add t0, t0, t2
> + sub a2, a2, t2
> +
> +.Lscalar:
> + beqz a2, .Lbail
> +
> +#ifndef __riscv_strict_align
> + slli t2, a1, 8
> + or a1, a1, t2
> + slli t2, a1, 16
> + or a1, a1, t2
> +
> + addi t2, zero, 4
> + bltu a2, t2, 2f
> +
> +1:
> + sw a1, 0(t0)
> + addi t0, t0, 4
> + addi a2, a2, -4
> + bgeu a2, t2, 1b
> +2:
> + beqz a2, .Lbail
> +#endif // __riscv_strict_align
> +
> + add a2, a2, t0
> +1:
> + sb a1, 0(t0)
> + addi t0, t0, 1
> + bltu t0, a2, 1b
> + ret
> +
> +.size memset, .-memset
> +libc_hidden_builtin_def (memset)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strchr.S b/sysdeps/riscv/rv64/rvv/strchr.S
> new file mode 100644
> index 0000000000..0b37174c55
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strchr.S
> @@ -0,0 +1,92 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strchr
> +.type strchr,@function
> +
> +.globl __strchrnul
> +.type __strchrnul,@function
> +
> +/*
> + * optimized strchr for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 2*vlenb
> + */
> +
> +.align 2
> +__strchrnul:
> + li t5, -1
> + j 1f
> +
> +strchr:
> + mv t5, zero
> +1: csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> + addi t2, t1, -1 /* mask off unaligned part of pointer */
> + and t2, a0, t2
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search however many bytes
> + are needed to align the pointer */
> + vsetvli t2, t2, e8, m2, ta, mu
> +
> + vle8.v v2, (a0) /* load data into v2(,v3) */
> + vmseq.vx v4, v2, zero
> + vfirst.m t4, v4
> + vmsbf.m v0, v4
> + vmseq.vx v0, v2, a1, v0.t
> + vfirst.m t3, v0
> + bgez t3, .Lfound
> + bgez t4, .Lbufferend
> + add a0, a0, t2
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, mu
> + li t4, -1
> +
> +1:
> + vle8.v v2, (a0)
> + vmseq.vx v4, v2, zero
> + vfirst.m t4, v4
> + vmsbf.m v0, v4
> + vmseq.vx v0, v2, a1, v0.t
> + vfirst.m t3, v0
> + bgez t3, .Lfound
> + bgez t4, .Lbufferend
> + add a0, a0, t1
> + j 1b
> +
> +.Lfound: /* found the target at a0+t3 */
> + add a0, a0, t3
> + ret
> +
> +.Lbufferend:
> + add a0, a0, t4
> + and a0, a0, t5
> + ret
> +
> +.size strchr, .-strchr
> +.size __strchrnul, .-__strchrnul
> +
> +libc_hidden_builtin_def (strchr)
> +weak_alias (__strchrnul, strchrnul)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strchrnul.c b/sysdeps/riscv/rv64/rvv/strchrnul.c
> new file mode 100644
> index 0000000000..259da80358
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strchrnul.c
> @@ -0,0 +1,22 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +/* strchrnul is implemented in strchr.S
> + */
> diff --git a/sysdeps/riscv/rv64/rvv/strcmp.S b/sysdeps/riscv/rv64/rvv/strcmp.S
> new file mode 100644
> index 0000000000..4a219221ac
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcmp.S
> @@ -0,0 +1,108 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strcmp
> +.type strcmp,@function
> +
> +.align 2
> +
> +/* most of the time, one or both sides are unaligned and their alignments differ
> + * we need to check for a null terminator before crossing a page boundary
> + * strategy:
> + * - for each side, calculate masks for alignment and (vlenb * 2) - alignment
> + * - while no difference encountered:
> + * - for each side:
> + * - load bytes to end of next vlenb*2 block
> + * - check for null terminator
> + * - if no terminator, load bytes to fill rest of register
> + * - compare sides
> + */
> +
> +strcmp:
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> + vsetvli zero, t1, e8, m2, ta, mu
> + vid.v v30
> + addi t2, t1, -1 /* mask for unaligned part of ptr */
> + and t6, a0, t2 /* unaligned part of lhs */
> + and t5, a1, t2 /* unaligned part of rhs */
> + sub t6, t1, t6 /* safe number of lhs bytes to read */
> + sub t5, t1, t5 /* same, rhs */
> + vmsltu.vx v28, v30, t6 /* v28 = mask for first half of lhs load */
> + vmsltu.vx v26, v30, t5 /* v26 = mask for first half of rhs load */
> + vmv.v.x v16, zero
> + vmv.v.x v18, zero
> +
> +1: vmv.v.v v0, v28 /* lhs mask */
> + vle8.v v2, (a0), v0.t /* masked load from lhs */
> + vmseq.vx v16, v2, zero, v0.t /* check loaded bytes for null */
> + vmv.v.v v0, v26 /* rhs mask */
> + vfirst.m t2, v16 /* get lhs check result */
> + bgez t2, .Ltail /* bail if we can't safely check rest */
> + vle8.v v4, (a1), v0.t /* masked load from rhs */
> + vmseq.vx v18, v4, zero, v0.t /* check partial rhs for null */
> + vmnot.m v0, v28 /* mask for rest of lhs */
> + vfirst.m t3, v18 /* get check result */
> + bltz t3, 2f /* test it */
> + /* we see null terminator */
> + bge t3, t6, .Ltail /* have enough bytes for vector cmp? */
> +
> + vmsleu.vx v0, v30, t3 /* select rest + null */
> + vmsne.vv v0, v2, v4, v0.t /* compare */
> + vfirst.m t3, v0
> + bgez t3, 3f
> + mv a0, zero /* no difference */
> + ret
> +3: add a0, a0, t3
> + add a1, a1, t3
> + lbu t0, (a0)
> + lbu t1, (a1)
> +.Ldiff:
> + sub a0, t0, t1
> + ret
> +
> + /* ...no null terminator */
> +2: vle8.v v2, (a0), v0.t /* load rest of lhs */
> + vmnot.m v0, v26 /* mask for rest of rhs */
> + vle8.v v4, (a1), v0.t /* load rest of rhs */
> + vmsne.vv v0, v2, v4 /* compare */
> + add a0, a0, t1 /* advance ptrs */
> + vfirst.m t3, v0
> + add a1, a1, t1
> + bltz t3, 1b
> +
> + sub t3, t3, t1 /* found difference but we've already advanced a0 and a1 */
> + j 3b
> +
> +.Ltail:
> + lbu t0, (a0)
> + lbu t1, (a1)
> + bne t0, t1, .Ldiff
> + addi a0, a0, 1
> + addi a1, a1, 1
> + bnez t0, .Ltail
> + mv a0, zero
> + ret
> +
> +
> +.size strcmp, .-strcmp
> +libc_hidden_builtin_def (strcmp)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strcpy.S b/sysdeps/riscv/rv64/rvv/strcpy.S
> new file mode 100644
> index 0000000000..b21909d66f
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcpy.S
> @@ -0,0 +1,72 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strcpy
> +.type strcpy,@function
> +
> +/*
> + * optimized strcpy for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 2*vlenb
> + */
> +
> +.align 2
> +strcpy:
> + mv t0, a0 /* copy dest so we can return it */
> +
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> +
> + addi t2, t1, -1 /* mask unaligned part of ptr */
> + and t2, a1, t2
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search enough to align ptr */
> + vsetvli t2, t2, e8, m2, tu, mu
> + vle8.v v2, (a1)
> + vmseq.vx v4, v2, zero
> + vmsif.m v0, v4 /* copy but not past null */
> + vfirst.m t3, v4
> + vse8.v v2, (t0), v0.t
> + bgez t3, .Ldone
> + add t0, t0, t2
> + add a1, a1, t2
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> +
> +1:
> + vle8.v v2, (a1)
> + add a1, a1, t1
> + vmseq.vx v4, v2, zero
> + vmsif.m v0, v4
> + vfirst.m t3, v4
> + vse8.v v2, (t0), v0.t
> + add t0, t0, t1
> + bltz t3, 1b
> +
> +.Ldone:
> + ret
> +
> +.size strcpy, .-strcpy
> +libc_hidden_builtin_def (strcpy)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strcspn.c b/sysdeps/riscv/rv64/rvv/strcspn.c
> new file mode 100644
> index 0000000000..f0595a72fb
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcspn.c
> @@ -0,0 +1,22 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +/* strcspn is implemented in strspn.S
> + */
> diff --git a/sysdeps/riscv/rv64/rvv/strlen.S b/sysdeps/riscv/rv64/rvv/strlen.S
> new file mode 100644
> index 0000000000..c77d500693
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strlen.S
> @@ -0,0 +1,67 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strlen
> +.type strlen,@function
> +
> +/*
> + * optimized strlen for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 2*vlenb
> + */
> +
> +.align 2
> +strlen:
> + mv t4, a0 /* copy of buffer start */
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> + addi t2, t1, -1 /* mask off unaligned part of ptr */
> + and t2, a0, t2
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search fwd to align ptr */
> + vsetvli t2, t2, e8, m2, ta, ma
> + vle8.v v2, (a0)
> + vmseq.vx v0, v2, zero
> + vfirst.m t3, v0
> + bgez t3, .Lfound
> + add a0, a0, t2
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, ma /* search 2*vlenb bytes per pass */
> + add t4, t4, t1
> +
> +1:
> + vle8.v v2, (a0)
> + add a0, a0, t1
> + vmseq.vx v0, v2, zero
> + vfirst.m t3, v0
> + bltz t3, 1b
> +
> +.Lfound: /* found the 0; subtract */
> + sub a0, a0, t4 /* buffer start from current ptr */
> + add a0, a0, t3 /* and add offset into fetched */
> + ret /* data to get length */
> +
> +.size strlen, .-strlen
> +libc_hidden_builtin_def (strlen)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strncmp.S b/sysdeps/riscv/rv64/rvv/strncmp.S
> new file mode 100644
> index 0000000000..863e5cb525
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strncmp.S
> @@ -0,0 +1,104 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strncmp
> +.type strncmp,@function
> +
> +.align 2
> +
> +/* as strcmp, but with added checks on a2 (max count)
> + */
> +
> +strncmp:
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> + blt a2, t1, .Ltail /* degrade if max < vlenb*2 */
> + vsetvli zero, t1, e8, m2, ta, mu
> + vid.v v30
> + addi t2, t1, -1 /* mask unaligned part of ptr */
> + and t6, a0, t2 /* unaligned part of lhs */
> + and t5, a1, t2 /* unaligned part of rhs */
> + sub t6, t1, t6 /* safe count to read from lhs */
> + sub t5, t1, t5 /* same, rhs */
> + vmsltu.vx v28, v30, t6 /* mask for first part of lhs */
> + vmsltu.vx v26, v30, t5 /* mask for first part of rhs */
> + vmv.v.x v16, zero
> + vmv.v.x v18, zero
> +
> +
> +1: blt a2, t1, .Ltail
> + vmv.v.v v0, v28 /* lhs mask */
> + vle8.v v2, (a0), v0.t /* masked load from lhs */
> + vmseq.vx v16, v2, zero, v0.t /* check loaded bytes for null */
> + vmv.v.v v0, v26 /* rhs mask */
> + vfirst.m t2, v16 /* get lhs check result */
> + bgez t2, .Ltail /* can we safely check rest */
> + vle8.v v4, (a1), v0.t /* masked load from rhs */
> + vmseq.vx v18, v4, zero, v0.t /* check partial rhs */
> + vmnot.m v0, v28 /* mask for rest of lhs */
> + vfirst.m t3, v18 /* get check result */
> + bltz t3, 2f /* test it */
> + bge t3, t6, .Ltail
> +
> + vmsleu.vx v0, v30, t3 /* select rest of string + null */
> + vmsne.vv v0, v2, v4, v0.t /* compare */
> + vfirst.m t3, v0
> + bgez t3, 3f
> + mv a0, zero
> + ret
> +3: add a0, a0, t3
> + add a1, a1, t3
> + lbu t0, (a0)
> + lbu t1, (a1)
> +.Ldiff:
> + sub a0, t0, t1
> + ret
> +
> + /* ...no null terminator in first part of lhs or rhs */
> +2: vle8.v v2, (a0), v0.t /* load rest of lhs */
> + vmnot.m v0, v26 /* mask for rest of rhs */
> + vle8.v v4, (a1), v0.t /* load rest of rhs */
> + vmsne.vv v0, v2, v4 /* compare */
> + add a0, a0, t1 /* advance ptrs */
> + vfirst.m t3, v0
> + add a1, a1, t1
> + sub a2, a2, t1
> + bltz t3, 1b
> +
> + sub t3, t3, t1 /* found a diff but we've already advanced a0 and a1 */
> + j 3b
> +
> +.Ltail:
> + beqz a2, 1f
> + addi a2, a2, -1
> + lbu t0, (a0)
> + lbu t1, (a1)
> + bne t0, t1, .Ldiff
> + addi a0, a0, 1
> + addi a1, a1, 1
> + bnez t0, .Ltail
> +1: mv a0, zero
> + ret
> +
> +
> +.size strncmp, .-strncmp
> +libc_hidden_builtin_def (strncmp)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strncpy.S b/sysdeps/riscv/rv64/rvv/strncpy.S
> new file mode 100644
> index 0000000000..8b3a1e545c
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strncpy.S
> @@ -0,0 +1,96 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strncpy
> +.type strncpy,@function
> +
> +/*
> + * optimized strncpy for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 2*vlenb
> + */
> +
> +.align 2
> +strncpy:
> + mv t0, a0 /* need to return dest so copy */
> +
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> +
> + addi t2, t1, -1 /* mask off unaligned part of ptr */
> + and t2, a1, t2
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search to align the pointer */
> + vsetvli zero, t2, e8, m2, tu, mu
> + vle8.v v2, (a1)
> + vmseq.vx v4, v2, zero
> + vmsif.m v0, v4 /* copy to dest */
> + vfirst.m t3, v4
> + bgeu t2, a2, .Ldest_full
> + vse8.v v2, (t0), v0.t
> + bgez t3, .Lterminator_found
> + add t0, t0, t2
> + add a1, a1, t2
> + sub a2, a2, t2
> + beqz a2, .Ldone
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> +
> +1:
> + vle8.v v2, (a1)
> + add a1, a1, t1
> + vmseq.vx v4, v2, zero
> + vmsif.m v0, v4
> + vfirst.m t3, v4
> + bgeu t1, a2, .Ldest_full
> + vse8.v v2, (t0), v0.t
> + add t0, t0, t1
> + sub a2, a2, t1
> + bltz t3, 1b
> + sub t0, t0, t1
> +
> +.Lterminator_found:
> + addi sp, sp, -16
> + sd ra, 0(sp)
> + sd a0, 8(sp)
> + add a0, t0, t3
> + mv a1, zero
> + sub a2, a2, t3
> + jal ra, memset
> + ld ra, 0(sp)
> + ld a0, 8(sp)
> + addi sp, sp, 16
> +.Ldone:
> + ret
> +
> +.Ldest_full:
> + vid.v v6
> + vmsltu.vx v4, v6, a2
> + vmand.mm v0, v0, v4
> + vse8.v v2, (t0), v0.t
> + ret
> +
> +.size strncpy, .-strncpy
> +libc_hidden_builtin_def (strncpy)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strnlen.S b/sysdeps/riscv/rv64/rvv/strnlen.S
> new file mode 100644
> index 0000000000..6d7ee65c7a
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strnlen.S
> @@ -0,0 +1,81 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl __strnlen
> +.type __strnlen,@function
> +
> +/* vector optimized strnlen
> + * assume it's safe to read to the end of the page
> + * containing either a null terminator or the last byte of the count or both,
> + * but not past it
> + * assume page size >= vlenb*2
> + */
> +
> +.align 2
> +__strnlen:
> + mv t4, a0 /* stash a copy of start for later */
> + beqz a1, .LzeroCount
> +
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> + addi t2, t1, -1 /* mask off unaligned part of ptr */
> +    and t2, a0, t2
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search to align pointer to t1 */
> + bgeu t2, a1, 2f /* check it's safe */
> + mv t2, a1 /* it's not! look as far as permitted */
> +2: vsetvli t2, t2, e8, m2, ta, ma
> + vle8.v v2, (a0)
> + vmseq.vx v0, v2, zero
> + vfirst.m t3, v0
> + bgez t3, .Lfound
> + add a0, a0, t2
> + sub a1, a1, t2
> + bltu a1, t1, .LreachedCount
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, ma /* do 2*vlenb bytes per pass */
> +
> +1: vle8.v v2, (a0)
> + sub a1, a1, t1
> + vmseq.vx v0, v2, zero
> + vfirst.m t3, v0
> + bgez t3, .Lfound
> + add a0, a0, t1
> + bgeu a1, t1, 1b
> +.LreachedCount:
> + mv t2, a1 /* in case 0 < a1 < t1 */
> + bnez a1, 2b /* if so, still t2 bytes to check, all safe */
> +.LzeroCount:
> + sub a0, a0, t4
> + ret
> +
> +.Lfound: /* found the 0; subtract buffer start from current pointer */
> + add a0, a0, t3 /* and add offset into fetched data */
> + sub a0, a0, t4
> + ret
> +
> +.size __strnlen, .-__strnlen
> +weak_alias (__strnlen, strnlen)
> +libc_hidden_builtin_def (__strnlen)
> +libc_hidden_builtin_def (strnlen)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strrchr.S b/sysdeps/riscv/rv64/rvv/strrchr.S
> new file mode 100644
> index 0000000000..4bef8a3b9c
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strrchr.S
> @@ -0,0 +1,88 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strrchr
> +.type strrchr,@function
> +
> +/*
> + * optimized strrchr for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 2*vlenb
> + */
> +
> +.align 2
> +strrchr:
> + mv t5, a0 /* stash buffer ptr somewhere safe */
> + mv a0, zero /* result is nullptr unless we find better below */
> +
> + csrr t1, vlenb /* determine vlenb*2 */
> + add t1, t1, t1
> + addi t2, t1, -1 /* mask off unaligned part of ptr */
> + and t2, t5, t2
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search to align ptr to 2*vlenb */
> + vsetvli t2, t2, e8, m2, ta, mu
> +
> + vle8.v v2, (t5) /* load data into v2(,v3) */
> + vmseq.vx v4, v2, zero /* check for null terminator */
> + vfirst.m t4, v4 /* grab its position, if any */
> + vmsbf.m v0, v4 /* select valid chars */
> + vmseq.vx v0, v2, a1, v0.t /* search for candidate byte */
> + vfirst.m t3, v0 /* grab its position, if any */
> + bltz t3, 2f /* did we find a candidate? */
> +
> +3: add a0, t3, t5 /* we did! grab the address */
> + vmsof.m v1, v0 /* there might be more than one */
> + vmandn.mm v0, v0, v1 /* so clear the one we just found */
> + vfirst.m t3, v0 /* is there another? */
> + bgez t3, 3b
> +
> +2: bgez t4, .Ldone /* did we see a null terminator? */
> + add t5, t5, t2
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> +
> +1: vle8.v v2, (t5)
> + vmseq.vx v4, v2, zero
> + vfirst.m t4, v4
> + vmsbf.m v0, v4
> + vmseq.vx v0, v2, a1, v0.t
> + vfirst.m t3, v0
> + bltz t3, 2f
> +
> +3: add a0, t3, t5
> + vmsof.m v1, v0
> + vmandn.mm v0, v0, v1
> + vfirst.m t3, v0
> + bgez t3, 3b
> +
> +2: add t5, t5, t1
> + bltz t4, 1b
> +
> +.Ldone:
> + ret
> +
> +.size strrchr, .-strrchr
> +libc_hidden_builtin_def (strrchr)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strspn.S b/sysdeps/riscv/rv64/rvv/strspn.S
> new file mode 100644
> index 0000000000..2b9af5cc2d
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strspn.S
> @@ -0,0 +1,189 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strspn
> +.type strspn,@function
> +
> +.globl strcspn
> +.type strcspn,@function
> +
> +/*
> + * optimized strspn / strcspn for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 32
> + * strategy:
> + * - build a 256-bit table on the stack, where each elt is zero
> + * if encountering it should terminate computation and nonzero otherwise
> + * - use vectorised lookups into this to check 2*vlen elts at a time;
> + * this code is identical for strspn and strcspn and can be shared
> + *
> + * note that while V mandates at least 128 bit wide regs,
> + * we are building a 256 bit lookup table
> + * therefore we use either LMUL=1 or 2 depending on what the target supports
> + * therefore we only use even vector register numbers,
> + * so everything still works if we go with LMUL=2
> + */
> +
> +# -----------------------------
> +
> +.align 2
> +
> +strspn:
> + lbu t0, 0(a1)
> + bnez t0, .Lbuild_table
> + mv a0, zero
> + ret
> +
> +.Lbuild_table:
> + mv a6, a0 /* store incoming a0 */
> + li t1, 32 /* want to deal with 256 bits at a time, so 32 bytes */
> +
> + vsetvli zero, t1, e8, m1, tu, mu
> +#if __riscv_v_min_vlen < 256
> + /* we want to build a 256-bit table: use vlenb*2 with m2 if regs
> + * are 128 bits wide, or vlenb with m1 if they are 256 bits or wider.
> + * 'V' extension specifies a minimum vlen of 128 so this should cover
> + * all cases; we can skip the check if we know vlen >= 256 at compile time
> + */
> + csrr t2, vlenb
> + bgeu t2, t1, 1f
> + vsetvli zero, t1, e8, m2, tu, mu
> +1:
> +#endif // __riscv_v_min_vlen
> +
> + /* read one char from the charset at a time and write the correct bit
> + * in the lookup table; we could do SIMD if we ever get an extension
> + * that provides some way of scattering bytes into a reg group
> + */
> + vmv.v.x v16, zero /* clear out table */
> + vmv.v.x v8, zero /* clear out v8 */
> + li t3, 1
> + vmv.s.x v8, t3 /* v8 now all zeroes except bottom byte */
> +
> +1: vmv.v.x v2, zero /* clear out v2 */
> + addi a1, a1, 1 /* advance charset ptr */
> + srli t2, t0, 3 /* divide the byte we read earlier by 8 */
> + vslideup.vx v2, v8, t2 /* v2 now 1 in the correct byte, 0 elsewhere */
> + vsll.vx v2, v2, t0 /* v2 now 1 in the correct bit, 0 elsewhere */
> + vor.vv v16, v16, v2 /* or it in */
> + lbu t0, 0(a1) /* fetch next byte */
> + bnez t0, 1b /* loop until we hit the null terminator */
> +
> +/*
> + * Table is now built in v16.
> + * Strategy:
> + * - fetch next t1 bytes from memory
> + * - vrgather on their values divided by 8 to get relevant bytes of table
> + * - shift right to get the correct bit into bit 0 (the lsb)
> + * - and with 1, compare with expected terminator value, then check mask
> + * to see if we've found a terminator
> + *
> + * Before we can begin, a0 needs to be t1-aligned, so that when we fetch
> + * the next t1 bytes - any of which may be the null terminator -
> + * we do not cross a page boundary and read unmapped memory. Therefore
> + * we have one read of however many bytes are needed to align a0,
> + * before the main loop.
> + */
> +
> +.Lscan_table:
> + vmv.v.x v8, t3 /* v8 now t1 bytes of 0x01 */
> +
> + and t2, a0, t1 /* mask to align to t1 */
> + beqz t2, 2f /* or skip if we're already aligned */
> + sub t2, t1, t2 /* t2 now bytes to read to align to t1 */
> +
> + vid.v v2 /* build mask instead of changing vl */
> + vmsltu.vx v0, v2, t2 /* so we don't need to track LMUL */
> +
> + vle8.v v2, (a0), v0.t /* load next bytes from input */
> + vsrl.vi v4, v2, 3 /* divide by 8 */
> + vrgather.vv v6, v16, v4 /* corresponding bytes of bit table */
> + vsrl.vv v6, v6, v2 /* shift correct bits to lsb */
> + vand.vv v6, v6, v8 /* and with 1 to complete the lookups */
> + vmseq.vx v4, v6, zero, v0.t /* check to see if any 0s are present */
> + vfirst.m t0, v4 /* index of the first 0, if any */
> + bgez t0, .Lscan_end /* if we found one, stop */
> + add a0, a0, t2 /* advance by number of bytes we read */
> +
> +2: add a6, a6, t1 /* we'll advance a0 before the exit check */
> +1: vle8.v v2, (a0) /* as above but unmasked so t1 elts per pass */
> + add a0, a0, t1
> +
> + vsrl.vi v4, v2, 3
> + vrgather.vv v6, v16, v4
> + vsrl.vv v6, v6, v2
> + vand.vv v6, v6, v8
> +
> + vmseq.vx v4, v6, zero
> + vfirst.m t0, v4
> + bltz t0, 1b
> +
> +.Lscan_end:
> + add a0, a0, t0 /* calculate offset to terminating byte */
> + sub a0, a0, a6
> + ret
> +.size strspn, .-strspn
> +
> +/* strcspn
> + *
> + * table build exactly as for strspn, except:
> + * - the lookup table starts with all bits except bit 0 of byte 0 set
> + * - we clear the corresponding bit for each byte in the charset
> + * once table is built, we can reuse the scan code directly
> + */
> +
> +strcspn:
> + lbu t0, 0(a1)
> + beqz t0, strlen /* no rejections -> prefix is whole string */
> +
> + mv a6, a0
> + li t1, 32
> +
> + vsetvli zero, t1, e8, m1, tu, mu
> +#if __riscv_v_min_vlen < 256
> + csrr t2, vlenb
> + bgeu t2, t1, 1f
> + vsetvli zero, t1, e8, m2, tu, mu
> +1:
> +#endif // __riscv_v_min_vlen
> +
> + vmv.v.x v8, zero
> + li t3, 1 /* all bits clear except bit 0 of byte 0 */
> + vmv.s.x v8, t3
> + vnot.v v16, v8 /* v16 is the inverse of that */
> + li t4, -1
> +
> +1: vmv.v.x v2, zero
> + addi a1, a1, 1 /* advance charset ptr */
> + srli t2, t0, 3 /* select correct bit in v2 */
> + vslideup.vx v2, v8, t2
> + vsll.vx v2, v2, t0
> + vnot.v v2, v2 /* invert */
> + vand.vv v16, v16, v2 /* clear the relevant bit of table */
> + lbu t0, 0(a1)
> + bnez t0, 1b
> + j .Lscan_table
> +.size strcspn, .-strcspn
> +
> +libc_hidden_builtin_def (strspn)
> +libc_hidden_builtin_def (strcspn)
> \ No newline at end of file
> --
> 2.34.1
>
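For reference, here is a scalar C model of the strspn/strcspn bitmap-table
strategy described in strspn.S above. This is an illustrative sketch, not
code from the patch, and the names are made up; the assembly keeps the
256-bit table in a vector register group and tests 2*vlenb input bytes per
pass via vrgather, where the sketch tests one byte at a time.

#include <stddef.h>
#include <stdint.h>

/* Sketch only: invert == 0 models strspn, invert != 0 models strcspn.  */
static size_t
span_model (const char *s, const char *set, int invert)
{
  uint8_t table[32] = { 0 };    /* 256 bits: bit set = keep scanning */

  if (invert)                   /* strcspn starts with everything accepted */
    for (int i = 0; i < 32; i++)
      table[i] = 0xff;
  table[0] &= 0xfe;             /* never scan past the null terminator */

  for (const unsigned char *p = (const unsigned char *) set; *p; p++)
    {
      if (invert)
        table[*p >> 3] &= ~(1u << (*p & 7));   /* reject set members */
      else
        table[*p >> 3] |= 1u << (*p & 7);      /* accept set members */
    }

  const unsigned char *q = (const unsigned char *) s;
  while (table[*q >> 3] & (1u << (*q & 7)))
    q++;
  return (size_t) (q - (const unsigned char *) s);
}

Here strspn (s, set) corresponds to span_model (s, set, 0) and
strcspn (s, set) to span_model (s, set, 1).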
On Wed, Feb 1, 2023 at 1:54 AM Sergei Lewis <slewis@rivosinc.com> wrote:
>
> Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
> strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
> targeting the riscv "V" extension, version 1.0
>
> The vectorised implementations assume VLENB of at least 128 and at least 32
> registers (as mandated by the "V" extension spec). They also assume that
> VLENB is a power of two which is no larger than the page size, and (as
> vectorised code in glibc for other platforms does) that it is safe to read
> past null terminators / buffer ends provided one does not cross a page
> boundary.
>
> Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
> ---
> sysdeps/riscv/rv64/rvv/Implies | 2 +
> sysdeps/riscv/rv64/rvv/memchr.S | 127 +++++++++++++++++++
> sysdeps/riscv/rv64/rvv/memcmp.S | 93 ++++++++++++++
> sysdeps/riscv/rv64/rvv/memcpy.S | 154 +++++++++++++++++++++++
> sysdeps/riscv/rv64/rvv/memmove.c | 22 ++++
> sysdeps/riscv/rv64/rvv/memset.S | 89 ++++++++++++++
> sysdeps/riscv/rv64/rvv/strchr.S | 92 ++++++++++++++
> sysdeps/riscv/rv64/rvv/strchrnul.c | 22 ++++
> sysdeps/riscv/rv64/rvv/strcmp.S | 108 +++++++++++++++++
> sysdeps/riscv/rv64/rvv/strcpy.S | 72 +++++++++++
> sysdeps/riscv/rv64/rvv/strcspn.c | 22 ++++
> sysdeps/riscv/rv64/rvv/strlen.S | 67 ++++++++++
> sysdeps/riscv/rv64/rvv/strncmp.S | 104 ++++++++++++++++
> sysdeps/riscv/rv64/rvv/strncpy.S | 96 +++++++++++++++
> sysdeps/riscv/rv64/rvv/strnlen.S | 81 +++++++++++++
> sysdeps/riscv/rv64/rvv/strrchr.S | 88 ++++++++++++++
> sysdeps/riscv/rv64/rvv/strspn.S | 189 +++++++++++++++++++++++++++++
> 17 files changed, 1428 insertions(+)
> create mode 100644 sysdeps/riscv/rv64/rvv/Implies
> create mode 100644 sysdeps/riscv/rv64/rvv/memchr.S
> create mode 100644 sysdeps/riscv/rv64/rvv/memcmp.S
> create mode 100644 sysdeps/riscv/rv64/rvv/memcpy.S
> create mode 100644 sysdeps/riscv/rv64/rvv/memmove.c
> create mode 100644 sysdeps/riscv/rv64/rvv/memset.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strchr.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strchrnul.c
> create mode 100644 sysdeps/riscv/rv64/rvv/strcmp.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strcpy.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strcspn.c
> create mode 100644 sysdeps/riscv/rv64/rvv/strlen.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strncmp.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strncpy.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strnlen.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strrchr.S
> create mode 100644 sysdeps/riscv/rv64/rvv/strspn.S
>
> diff --git a/sysdeps/riscv/rv64/rvv/Implies b/sysdeps/riscv/rv64/rvv/Implies
> new file mode 100644
> index 0000000000..b07b4cb906
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/Implies
> @@ -0,0 +1,2 @@
> +riscv/rv64/rvd
> +
> diff --git a/sysdeps/riscv/rv64/rvv/memchr.S b/sysdeps/riscv/rv64/rvv/memchr.S
> new file mode 100644
> index 0000000000..a7e32b8f25
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memchr.S
> @@ -0,0 +1,127 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +
> +/* Optimised memchr for riscv with vector extension
> + * Assumptions:
> + * - cpu becomes bandwidth limited at or before
> + * 2 vector register sized read/write operations
> + * + 2 scalar operations
> + * + conditional branch
> + */
> +
> +.globl memchr
> +.type memchr,@function
> +
> +.align 2
> +memchr:
> + beqz a2, .Lnot_found
> + csrr t1, vlenb
> + bgeu a2, t1, .Lvector_path /* only use vector path if we're scanning
> + at least vlenb bytes */
> +
> +#ifndef __riscv_strict_align
strict-align is not the right thing to check here. As the RVA profile
document explains, all RVA-compliant implementations must support
misaligned loads and stores (so strict-align will be false), but they
might execute extremely slowly (e.g., via trap and emulate), and so
this approach will unduly penalize some implementations.
This is another justification for simply sticking with the generic routines.
> + li a3, 8
> + blt a2, a3, .Lbytewise
> +
> + li t1, 0x0101010101010101
> + slli a4, t1, 7 /* a4 = 0x8080808080808080 */
> + mul t2, a1, t1 /* entirety of t2 is now repeats of target character;
> + assume mul is no worse than 3*(shift+OR),
> + otherwise do that instead */
> +
> +/*
> + * strategy:
> + * t4 = ((*a0) ^ t2)
> + * - now t4 contains zero bytes if and only if next word of memory
> + * had target character at those positions
> + *
> + * t4 = ((t4-0x0101010101010101) & ~t4) & 0x8080808080808080
> + * - all nonzero bytes of t4 become 0; zero bytes become 0x80
> + *
> + * if t4 is nonzero, find the index of the byte within it, add to a0 and return
> + * otherwise, loop
> + */
> +
> +1:
> + ld t4, (a0) /* t4 = load next 8 bytes */
> + xor t4, t4, t2
> + sub t5, t4, t1
> + not t4, t4
> + and t4, t5, t4
> + and t4, t4, a4
> + bnez t4, .Lbytewise /* could use ctzw, mod+lookup or just binary chop
> + to locate byte of interest in t4 but profiling
> + shows these approaches are at best no better */
> + addi a2, a2, -8
> + addi a0, a0, 8
> + bgeu a2, a3, 1b
> + beqz a2, .Lnot_found
> +#endif // __riscv_strict_align
> +
> +/* too little data for a dword. mask calculation and branch mispredict costs
> + make checking a word not worthwhile. degrade to bytewise search. */
> +
> +.Lbytewise:
> + add t2, a0, a2
> +
> +1:
> + lb t1, (a0)
> + beq t1, a1, .Lfound
> + addi a0, a0, 1
> + blt a0, t2, 1b
> +
> +.Lnot_found:
> + mv a0, zero
> +.Lfound:
> + ret
> +
> +.Lvector_path:
> + vsetvli t2, a2, e8, m2, ta, ma
> +
> +1:
> + vle8.v v2, (a0)
> + vmseq.vx v0, v2, a1
> + vfirst.m t3, v0
> + bgez t3, .Lvec_found
> + add a0, a0, t2
> + sub a2, a2, t2
> + bge a2, t2, 1b
> + bnez a2, 2f
> + mv a0, zero
> + ret
> +
> +2:
> + vsetvli t2, a2, e8, m2, ta, ma
> + vle8.v v2, (a0)
> + vmseq.vx v0, v2, a1
> + vfirst.m t3, v0
> + bgez t3, .Lvec_found
> + mv a0, zero
> + ret
> +
> +.Lvec_found:
> + add a0, a0, t3
> + ret
> +
> +.size memchr, .-memchr
> +libc_hidden_builtin_def (memchr)
> \ No newline at end of file
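The scalar path above relies on the classic word-at-a-time zero-byte test;
a C sketch of it for readers unfamiliar with the trick (illustrative only,
not code from the patch):

#include <stdint.h>

/* Sketch only: nonzero iff x contains at least one zero byte.  memchr.S
   applies this after XORing each loaded word with a splat of the target
   character, so a zero byte marks a match.  */
static int
word_has_zero_byte (uint64_t x)
{
  const uint64_t ones = 0x0101010101010101ULL;
  const uint64_t highs = 0x8080808080808080ULL;
  return ((x - ones) & ~x & highs) != 0;
}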
> diff --git a/sysdeps/riscv/rv64/rvv/memcmp.S b/sysdeps/riscv/rv64/rvv/memcmp.S
> new file mode 100644
> index 0000000000..a945753a5f
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memcmp.S
> @@ -0,0 +1,93 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +/* Optimised memcmp for riscv with vector extension
> + */
> +
> +.globl memcmp
> +.type memcmp,@function
> +
> +.align 2
> +
> +memcmp:
> + mv t2, zero
> + beqz a2, .Ldone
> +
> + li t1, 5 /* scalar path cheaper for 1-4 elts */
> + bltu a2, t1, .Lscalar
> +
> + /* main loop, vlenb*2 elts at a time */
> + vsetvli t1, a2, e8, m2, ta, ma
> +
> +1:
> + vle8.v v2, (a0) /* load elts */
> + vle8.v v4, (a1)
> + vmsne.vv v0, v2, v4 /* compare */
> + vfirst.m t3, v0
> + bgez t3, .Lvec_diff /* found a difference ? */
> + add a0, a0, t1 /* not yet, advance everything */
> + add a1, a1, t1
> + sub a2, a2, t1
> + bgeu a2, t1, 1b
> +
> + bnez a2, .Ltail
> + mv a0, zero
> + ret
> +
> +.Ltail:
> + /* handle tail. we know a2 < vlenb*2 so just load and compare the lot */
> + vsetvli t1, a2, e8, m2, ta, ma
> + vle8.v v2, (a0)
> + vle8.v v4, (a1)
> + vmsne.vv v0, v2, v4
> + vfirst.m t3, v0
> + bgez t3, .Lvec_diff
> + mv a0, zero /* no diff found */
> + ret
> +
> +.Lvec_diff: /* v2, v4 differ at elt t3 */
> + add a0, a0, t3
> + add a1, a1, t3
> + lbu t0, (a0)
> + lbu t1, (a1)
> + sub a0, t0, t1
> + ret
> +
> +.Lscalar:
> + add t3, a0, a2
> +
> +1:
> + lbu t0, (a0)
> + lbu t1, (a1)
> + sub t2, t0, t1
> + bnez t2, .Ldone
> + addi a0, a0, 1
> + addi a1, a1, 1
> + bltu a0, t3, 1b
> +
> +.Ldone:
> + mv a0, t2
> + ret
> +
> +
> +.size memcmp, .-memcmp
> +libc_hidden_builtin_def (memcmp)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/memcpy.S b/sysdeps/riscv/rv64/rvv/memcpy.S
> new file mode 100644
> index 0000000000..7b37ec285d
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memcpy.S
> @@ -0,0 +1,154 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +/* Optimised memcpy and memmove for riscv with vector extension
> + */
> +
> +.globl memcpy
> +.type memcpy,@function
> +.globl memmove
> +.type memmove,@function
> +
> +.align 2
> +memmove:
> + bge a0, a1, .Lmemcpy_rev
> +
> +memcpy:
> +.Lmemcpy_fwd:
> + mv t0, a0 /* t0 = preserve a0 so we can return it */
> + csrr t2, vlenb /* t2 = number of bytes per vectorised copy op */
> + slli t5, t2, 1 /* t5 = number of bytes per loop */
> + addi t3, t5, -1 /* generate mask */
> + not t4, t3
> + and t4, a2, t4 /* t4 = bytes copied in vectorised pass */
> +
> + beqz t4, .Lscalar_fwd /* size too small for even one pass? */
> +
> + and a2, a2, t3 /* a2 = bytes still left to copy after pass */
> + add t4, t4, a1 /* t4 = src at end of vectorised pass */
> +
> +1:
> + vl2r.v v2, (a1) /* load, advance source */
> + add a1, a1, t5
> + vs2r.v v2, (t0) /* store, advance dest */
> + add t0, t0, t5
> + bltu a1, t4, 1b /* src at end? */
> +
> + bltu a2, t2, .Lscalar_fwd /* should we do one more vec load/store? */
> + vl1r.v v2, (a1)
> + sub a2, a2, t2
> + add a1, a1, t2
> + vs1r.v v2, (t0)
> + add t0, t0, t2
> +
> +.Lscalar_fwd:
> + bnez a2, .Lnobail
> +.Lbail:
> + ret
> +.Lnobail:
> +
> +#ifndef __riscv_strict_align
> + addi t2, zero, 4
> + bltu a2, t2, .Lsingle_bytes
> +1:
> + lw t3, 0(a1)
> + addi a1, a1, 4
> + sw t3, 0(t0)
> + addi t0, t0, 4
> + addi a2, a2, -4
> + bgeu a2, t2, 1b
> +#endif // __riscv_strict_align
> +
> +.Lsingle_bytes:
> + beqz a2, .Lbail
> + add a2, a2, a1 /* a2 = src + remaining size */
> +1:
> + lb t1, 0(a1)
> + sb t1, 0(t0)
> + addi a1, a1, 1
> + addi t0, t0, 1
> + bltu a1, a2, 1b
> + ret
> +.size memcpy, .-memcpy
> +
> +
> +.Lmemcpy_rev:
> + beq a0, a1, .Lmemcpy_rev_done
> + add t0, a0, a2 /* t0 = dest so we can return a0=dest later */
> + add t6, a1, a2 /* dest and src both point to byte */
> + /* immediately after end of buffer */
> +
> + csrr t2, vlenb /* t2 = number of bytes per pass */
> + slli t5, t2, 1 /* t5 = number of bytes per entire loop */
> + addi t3, t5, -1 /* t3 = (bytes per loop) mask */
> + not t4, t3 /* generate mask for bytes processed by loop */
> + and t4, a2, t4 /* t4 = bytes copied in vectorised pass */
> +
> + beqz t4, .Lscalar_rev /* size too small for even one pass? */
> +
> + and a2, a2, t3 /* a2 = bytes still left to copy after pass */
> + sub t4, t6, t4 /* t4 = src at end of vectorised pass */
> +
> +1:
> + sub t6, t6, t5
> + sub t0, t0, t5
> + vl2r.v v2, (t6) /* load, advance source */
> + vs2r.v v2, (t0) /* store, advance dest */
> + bgtu t6, t4, 1b /* src at end? */
> +
> + bltu a2, t2, .Lscalar_rev /* should we do one more vec load/store? */
> + sub t6, t6, t2
> + sub t0, t0, t2
> + sub a2, a2, t2
> + vl1r.v v2, (t6)
> + vs1r.v v2, (t0)
> +
> +.Lscalar_rev:
> +#ifndef __riscv_strict_align
> + beqz a2, .Lbail
> +
> + addi t2, zero, 4
> + bltu a2, t2, 2f
> +1:
> + addi t6, t6, -4
> + addi t0, t0, -4
> + addi a2, a2, -4
> + lw t3, 0(t6)
> + sw t3, 0(t0)
> + bgeu a2, t2, 1b
> +2:
> +#endif // __riscv_strict_align
> +
> + beqz a2, .Lbail
> +1:
> + addi t6, t6, -1
> + addi t0, t0, -1
> + lb t1, 0(t6)
> + sb t1, 0(t0)
> + bgtu t0, a0, 1b
> +
> +.Lmemcpy_rev_done:
> + ret
> +
> +.size memmove, .-memmove
> +libc_hidden_builtin_def (memcpy)
> +libc_hidden_builtin_def (memmove)
> \ No newline at end of file
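The memmove entry point above only has to pick a copy direction; a minimal
C model of that decision (sketch, not code from the patch):

#include <stddef.h>

/* Sketch only: copying forwards when dest < src and backwards otherwise
   is sufficient for correctness with overlapping buffers, which is why
   memmove above can fall through into memcpy for the forward case.  */
static void *
memmove_model (void *dest, const void *src, size_t n)
{
  unsigned char *d = dest;
  const unsigned char *s = src;
  if (d < s)
    for (size_t i = 0; i < n; i++)   /* forward copy */
      d[i] = s[i];
  else
    while (n--)                      /* backward copy */
      d[n] = s[n];
  return dest;
}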
> diff --git a/sysdeps/riscv/rv64/rvv/memmove.c b/sysdeps/riscv/rv64/rvv/memmove.c
> new file mode 100644
> index 0000000000..47734854f9
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memmove.c
> @@ -0,0 +1,22 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +/* memmove is implemented in memcpy.S
> + */
> diff --git a/sysdeps/riscv/rv64/rvv/memset.S b/sysdeps/riscv/rv64/rvv/memset.S
> new file mode 100644
> index 0000000000..6f82c542b1
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memset.S
> @@ -0,0 +1,89 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +
> +/* Optimised memset for riscv with vector extension
> + */
> +
> +.globl memset
> +.type memset,@function
> +
> +.align 2
> +memset:
> + mv t0, a0 /* t0 = dest so we can return a0 later */
> + vsetvli t2, a2, e8, m2, ta, ma /* t2 = elts per copy */
> + beqz t2, .Lscalar
> +
> + vmv.v.x v2, a1 /* splat value across v2 */
> +
> + slli t3, t2, 1
> + bgtu t3, a2, .Lsinglestore
> +
> +1:
> + vse8.v v2, (t0)
> + add t0, t0, t2
> + vse8.v v2, (t0)
> + add t0, t0, t2
> + sub a2, a2, t3
> + bgeu a2, t3, 1b
> + bgeu a2, t2, .Lsinglestore
> + bnez a2, .Lscalar
> +
> +.Lbail:
> + ret
> +
> +.Lsinglestore:
> + bgtu t2, a2, .Lscalar
> + vse8.v v2, (t0)
> + add t0, t0, t2
> + sub a2, a2, t2
> +
> +.Lscalar:
> + beqz a2, .Lbail
> +
> +#ifndef __riscv_strict_align
> + slli t2, a1, 8
> + or a1, a1, t2
> + slli t2, a1, 16
> + or a1, a1, t2
> +
> + addi t2, zero, 4
> + bltu a2, t2, 2f
> +
> +1:
> + sw a1, 0(t0)
> + addi t0, t0, 4
> + addi a2, a2, -4
> + bgeu a2, t2, 1b
> +2:
> + beqz a2, .Lbail
> +#endif // __riscv_strict_align
> +
> + add a2, a2, t0
> +1:
> + sb a1, 0(t0)
> + addi t0, t0, 1
> + bltu t0, a2, 1b
> + ret
> +
> +.size memset, .-memset
> +libc_hidden_builtin_def (memset)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strchr.S b/sysdeps/riscv/rv64/rvv/strchr.S
> new file mode 100644
> index 0000000000..0b37174c55
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strchr.S
> @@ -0,0 +1,92 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strchr
> +.type strchr,@function
> +
> +.globl __strchrnul
> +.type __strchrnul,@function
> +
> +/*
> + * optimized strchr for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 2*vlenb
> + */
> +
> +.align 2
> +__strchrnul:
> + li t5, -1
> + j 1f
> +
> +strchr:
> + mv t5, zero
> +1: csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> + addi t2, t1, -1 /* mask off unaligned part of pointer */
> + and t2, a0, t2
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search however many bytes
> + are needed to align the pointer */
> + vsetvli t2, t2, e8, m2, ta, mu
> +
> + vle8.v v2, (a0) /* load data into v2(,v3) */
> + vmseq.vx v4, v2, zero
> + vfirst.m t4, v4
> + vmsbf.m v0, v4
> + vmseq.vx v0, v2, a1, v0.t
> + vfirst.m t3, v0
> + bgez t3, .Lfound
> + bgez t4, .Lbufferend
> + add a0, a0, t2
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, mu
> + li t4, -1
> +
> +1:
> + vle8.v v2, (a0)
> + vmseq.vx v4, v2, zero
> + vfirst.m t4, v4
> + vmsbf.m v0, v4
> + vmseq.vx v0, v2, a1, v0.t
> + vfirst.m t3, v0
> + bgez t3, .Lfound
> + bgez t4, .Lbufferend
> + add a0, a0, t1
> + j 1b
> +
> +.Lfound: /* found the target at a0+t3 */
> + add a0, a0, t3
> + ret
> +
> +.Lbufferend:
> + add a0, a0, t4
> + and a0, a0, t5
> + ret
> +
> +.size strchr, .-strchr
> +.size __strchrnul, .-__strchrnul
> +
> +libc_hidden_builtin_def (strchr)
> +weak_alias (__strchrnul, strchrnul)
> \ No newline at end of file
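strchr and __strchrnul above share one code path and differ only in the
value of t5; a small C model of the mask trick in that shared tail
(illustrative sketch, not code from the patch):

#include <stdint.h>

/* Sketch only: mask is 0 for strchr and all-ones for strchrnul, so the
   same "hit the terminator" exit yields either a null pointer or a
   pointer to the NUL, matching the .Lbufferend sequence above.  */
static char *
strchr_common_model (const char *s, int c, uintptr_t mask)
{
  for (;; s++)
    {
      if (*s == (char) c)
        return (char *) s;                        /* .Lfound */
      if (*s == '\0')
        return (char *) ((uintptr_t) s & mask);   /* .Lbufferend */
    }
}

Then strchr (s, c) behaves like strchr_common_model (s, c, 0) and
strchrnul (s, c) like strchr_common_model (s, c, (uintptr_t) -1).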
> diff --git a/sysdeps/riscv/rv64/rvv/strchrnul.c b/sysdeps/riscv/rv64/rvv/strchrnul.c
> new file mode 100644
> index 0000000000..259da80358
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strchrnul.c
> @@ -0,0 +1,22 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +/* strchrnul is implemented in strchr.S
> + */
> diff --git a/sysdeps/riscv/rv64/rvv/strcmp.S b/sysdeps/riscv/rv64/rvv/strcmp.S
> new file mode 100644
> index 0000000000..4a219221ac
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcmp.S
> @@ -0,0 +1,108 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strcmp
> +.type strcmp,@function
> +
> +.align 2
> +
> +/* most of the time, one or both sides are unaligned and their alignments differ
> + * we need to check for a null terminator before crossing a page boundary
> + * strategy:
> + * - for each side, calculate masks for alignment and (vlenb * 2) - alignment
> + * - while no difference encountered:
> + * - for each side:
> + * - load bytes to end of next vlenb*2 block
> + * - check for null terminator
> + * - if no terminator, load bytes to fill rest of register
> + * - compare sides
> + */
> +
> +strcmp:
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> + vsetvli zero, t1, e8, m2, ta, mu
> + vid.v v30
> + addi t2, t1, -1 /* mask for unaligned part of ptr */
> + and t6, a0, t2 /* unaligned part of lhs */
> + and t5, a1, t2 /* unaligned part of rhs */
> + sub t6, t1, t6 /* safe number of lhs bytes to read */
> + sub t5, t1, t5 /* same, rhs */
> + vmsltu.vx v28, v30, t6 /* v28 = mask for first half of lhs load */
> + vmsltu.vx v26, v30, t5 /* v26 = mask for first half of rhs load */
> + vmv.v.x v16, zero
> + vmv.v.x v18, zero
> +
> +1: vmv.v.v v0, v28 /* lhs mask */
> + vle8.v v2, (a0), v0.t /* masked load from lhs */
> + vmseq.vx v16, v2, zero, v0.t /* check loaded bytes for null */
> + vmv.v.v v0, v26 /* rhs mask */
> + vfirst.m t2, v16 /* get lhs check result */
> + bgez t2, .Ltail /* bail if we can't safely check rest */
> + vle8.v v4, (a1), v0.t /* masked load from rhs */
> + vmseq.vx v18, v4, zero, v0.t /* check partial rhs for null */
> + vmnot.m v0, v28 /* mask for rest of lhs */
> + vfirst.m t3, v18 /* get check result */
> + bltz t3, 2f /* test it */
> + /* we see null terminator */
> + bge t3, t6, .Ltail /* have enough bytes for vector cmp? */
> +
> + vmsleu.vx v0, v30, t3 /* select rest + null */
> + vmsne.vv v0, v2, v4, v0.t /* compare */
> + vfirst.m t3, v0
> + bgez t3, 3f
> + mv a0, zero /* no difference */
> + ret
> +3: add a0, a0, t3
> + add a1, a1, t3
> + lbu t0, (a0)
> + lbu t1, (a1)
> +.Ldiff:
> + sub a0, t0, t1
> + ret
> +
> + /* ...no null terminator */
> +2: vle8.v v2, (a0), v0.t /* load rest of lhs */
> + vmnot.m v0, v26 /* mask for rest of rhs */
> + vle8.v v4, (a1), v0.t /* load rest of rhs */
> + vmsne.vv v0, v2, v4 /* compare */
> + add a0, a0, t1 /* advance ptrs */
> + vfirst.m t3, v0
> + add a1, a1, t1
> + bltz t3, 1b
> +
> + sub t3, t3, t1 /* found difference but we've already advanced a0 and a1 */
> + j 3b
> +
> +.Ltail:
> + lbu t0, (a0)
> + lbu t1, (a1)
> + bne t0, t1, .Ldiff
> + addi a0, a0, 1
> + addi a1, a1, 1
> + bnez t0, .Ltail
> + mv a0, zero
> + ret
> +
> +
> +.size strcmp, .-strcmp
> +libc_hidden_builtin_def (strcmp)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strcpy.S b/sysdeps/riscv/rv64/rvv/strcpy.S
> new file mode 100644
> index 0000000000..b21909d66f
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcpy.S
> @@ -0,0 +1,72 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strcpy
> +.type strcpy,@function
> +
> +/*
> + * optimized strcpy for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 2*vlenb
> + */
> +
> +.align 2
> +strcpy:
> + mv t0, a0 /* copy dest so we can return it */
> +
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> +
> + addi t2, t1, -1 /* mask unaligned part of ptr */
> + and t2, a1, t2
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search enough to align ptr */
> + vsetvli t2, t2, e8, m2, tu, mu
> + vle8.v v2, (a1)
> + vmseq.vx v4, v2, zero
> + vmsif.m v0, v4 /* copy but not past null */
> + vfirst.m t3, v4
> + vse8.v v2, (t0), v0.t
> + bgez t3, .Ldone
> + add t0, t0, t2
> + add a1, a1, t2
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> +
> +1:
> + vle8.v v2, (a1)
> + add a1, a1, t1
> + vmseq.vx v4, v2, zero
> + vmsif.m v0, v4
> + vfirst.m t3, v4
> + vse8.v v2, (t0), v0.t
> + add t0, t0, t1
> + bltz t3, 1b
> +
> +.Ldone:
> + ret
> +
> +.size strcpy, .-strcpy
> +libc_hidden_builtin_def (strcpy)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strcspn.c b/sysdeps/riscv/rv64/rvv/strcspn.c
> new file mode 100644
> index 0000000000..f0595a72fb
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcspn.c
> @@ -0,0 +1,22 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +/* strcspn is implemented in strspn.S
> + */
> diff --git a/sysdeps/riscv/rv64/rvv/strlen.S b/sysdeps/riscv/rv64/rvv/strlen.S
> new file mode 100644
> index 0000000000..c77d500693
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strlen.S
> @@ -0,0 +1,67 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strlen
> +.type strlen,@function
> +
> +/*
> + * optimized strlen for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 2*vlenb
> + */
> +
> +.align 2
> +strlen:
> + mv t4, a0 /* copy of buffer start */
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> + addi t2, t1, -1 /* mask off unaligned part of ptr */
> + and t2, a0, t2
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search fwd to align ptr */
> + vsetvli t2, t2, e8, m2, ta, ma
> + vle8.v v2, (a0)
> + vmseq.vx v0, v2, zero
> + vfirst.m t3, v0
> + bgez t3, .Lfound
> + add a0, a0, t2
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, ma /* search 2*vlenb bytes per pass */
> + add t4, t4, t1
> +
> +1:
> + vle8.v v2, (a0)
> + add a0, a0, t1
> + vmseq.vx v0, v2, zero
> + vfirst.m t3, v0
> + bltz t3, 1b
> +
> +.Lfound: /* found the 0; subtract */
> + sub a0, a0, t4 /* buffer start from current ptr */
> + add a0, a0, t3 /* and add offset into fetched */
> + ret /* data to get length */
> +
> +.size strlen, .-strlen
> +libc_hidden_builtin_def (strlen)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strncmp.S b/sysdeps/riscv/rv64/rvv/strncmp.S
> new file mode 100644
> index 0000000000..863e5cb525
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strncmp.S
> @@ -0,0 +1,104 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strncmp
> +.type strncmp,@function
> +
> +.align 2
> +
> +/* as strcmp, but with added checks on a2 (max count)
> + */
> +
> +strncmp:
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> + blt a2, t1, .Ltail /* degrade if max < vlenb*2 */
> + vsetvli zero, t1, e8, m2, ta, mu
> + vid.v v30
> + addi t2, t1, -1 /* mask unaligned part of ptr */
> + and t6, a0, t2 /* unaligned part of lhs */
> + and t5, a1, t2 /* unaligned part of rhs */
> + sub t6, t1, t6 /* safe count to read from lhs */
> + sub t5, t1, t5 /* same, rhs */
> + vmsltu.vx v28, v30, t6 /* mask for first part of lhs */
> + vmsltu.vx v26, v30, t5 /* mask for first part of rhs */
> + vmv.v.x v16, zero
> + vmv.v.x v18, zero
> +
> +
> +1: blt a2, t1, .Ltail
> + vmv.v.v v0, v28 /* lhs mask */
> + vle8.v v2, (a0), v0.t /* masked load from lhs */
> + vmseq.vx v16, v2, zero, v0.t /* check loaded bytes for null */
> + vmv.v.v v0, v26 /* rhs mask */
> + vfirst.m t2, v16 /* get lhs check result */
> + bgez t2, .Ltail /* can we safely check rest */
> + vle8.v v4, (a1), v0.t /* masked load from rhs */
> + vmseq.vx v18, v4, zero, v0.t /* check partial rhs */
> + vmnot.m v0, v28 /* mask for rest of lhs */
> + vfirst.m t3, v18 /* get check result */
> + bltz t3, 2f /* test it */
> + bge t3, t6, .Ltail
> +
> + vmsleu.vx v0, v30, t3 /* select rest of string + null */
> + vmsne.vv v0, v2, v4, v0.t /* compare */
> + vfirst.m t3, v0
> + bgez t3, 3f
> + mv a0, zero
> + ret
> +3: add a0, a0, t3
> + add a1, a1, t3
> + lbu t0, (a0)
> + lbu t1, (a1)
> +.Ldiff:
> + sub a0, t0, t1
> + ret
> +
> + /* ...no null terminator in first part of lhs or rhs */
> +2: vle8.v v2, (a0), v0.t /* load rest of lhs */
> + vmnot.m v0, v26 /* mask for rest of rhs */
> + vle8.v v4, (a1), v0.t /* load rest of rhs */
> + vmsne.vv v0, v2, v4 /* compare */
> + add a0, a0, t1 /* advance ptrs */
> + vfirst.m t3, v0
> + add a1, a1, t1
> + sub a2, a2, t1
> + bltz t3, 1b
> +
> + sub t3, t3, t1 /* found a diff but we've already advanced a0 and a1 */
> + j 3b
> +
> +.Ltail:
> + beqz a2, 1f
> + addi a2, a2, -1
> + lbu t0, (a0)
> + lbu t1, (a1)
> + bne t0, t1, .Ldiff
> + addi a0, a0, 1
> + addi a1, a1, 1
> + bnez t0, .Ltail
> +1: mv a0, zero
> + ret
> +
> +
> +.size strncmp, .-strncmp
> +libc_hidden_builtin_def (strncmp)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strncpy.S b/sysdeps/riscv/rv64/rvv/strncpy.S
> new file mode 100644
> index 0000000000..8b3a1e545c
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strncpy.S
> @@ -0,0 +1,96 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strncpy
> +.type strncpy,@function
> +
> +/*
> + * optimized strncpy for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 2*vlenb
> + */
> +
> +.align 2
> +strncpy:
> + mv t0, a0 /* need to return dest so copy */
> +
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> +
> + addi t2, t1, -1 /* mask off unaligned part of ptr */
> + and t2, a1, t2
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search to align the pointer */
> + vsetvli zero, t2, e8, m2, tu, mu
> + vle8.v v2, (a1)
> + vmseq.vx v4, v2, zero
> + vmsif.m v0, v4 /* copy to dest */
> + vfirst.m t3, v4
> + bgeu t2, a2, .Ldest_full
> + vse8.v v2, (t0), v0.t
> + bgez t3, .Lterminator_found
> + add t0, t0, t2
> + add a1, a1, t2
> + sub a2, a2, t2
> + beqz a2, .Ldone
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> +
> +1:
> + vle8.v v2, (a1)
> + add a1, a1, t1
> + vmseq.vx v4, v2, zero
> + vmsif.m v0, v4
> + vfirst.m t3, v4
> + bgeu t1, a2, .Ldest_full
> + vse8.v v2, (t0), v0.t
> + add t0, t0, t1
> + sub a2, a2, t1
> + bltz t3, 1b
> + sub t0, t0, t1
> +
> +.Lterminator_found:
> + addi sp, sp, -16
> + sd ra, 0(sp)
> + sd a0, 8(sp)
> + add a0, t0, t3
> + mv a1, zero
> + sub a2, a2, t3
> + jal ra, memset
> + ld ra, 0(sp)
> + ld a0, 8(sp)
> + addi sp, sp, 16
> +.Ldone:
> + ret
> +
> +.Ldest_full:
> + vid.v v6
> + vmsltu.vx v4, v6, a2
> + vmand.mm v0, v0, v4
> + vse8.v v2, (t0), v0.t
> + ret
> +
> +.size strncpy, .-strncpy
> +libc_hidden_builtin_def (strncpy)
> \ No newline at end of file
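strncpy above follows the usual split of the work: copy until the
terminator or the limit, then zero-pad the rest by calling memset. A C
model of that shape (sketch, not code from the patch):

#include <string.h>

/* Sketch only: copy up to n bytes, stopping at the terminator, then pad
   the remainder with NULs as ISO C strncpy requires.  */
static char *
strncpy_model (char *dest, const char *src, size_t n)
{
  size_t i = 0;
  for (; i < n && src[i] != '\0'; i++)
    dest[i] = src[i];
  if (i < n)
    memset (dest + i, 0, n - i);
  return dest;
}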
> diff --git a/sysdeps/riscv/rv64/rvv/strnlen.S b/sysdeps/riscv/rv64/rvv/strnlen.S
> new file mode 100644
> index 0000000000..6d7ee65c7a
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strnlen.S
> @@ -0,0 +1,81 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl __strnlen
> +.type __strnlen,@function
> +
> +/* vector optimized strnlen
> + * assume it's safe to read to the end of the page
> + * containing either a null terminator or the last byte of the count or both,
> + * but not past it
> + * assume page size >= vlenb*2
> + */
> +
> +.align 2
> +__strnlen:
> + mv t4, a0 /* stash a copy of start for later */
> + beqz a1, .LzeroCount
> +
> + csrr t1, vlenb /* find vlenb*2 */
> + add t1, t1, t1
> + addi t2, t1, -1 /* mask off unaligned part of ptr */
> + and t2, a1, a0
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search to align pointer to t1 */
> + bgeu t2, a1, 2f /* check it's safe */
> + mv t2, a1 /* it's not! look as far as permitted */
> +2: vsetvli t2, t2, e8, m2, ta, ma
> + vle8.v v2, (a0)
> + vmseq.vx v0, v2, zero
> + vfirst.m t3, v0
> + bgez t3, .Lfound
> + add a0, a0, t2
> + sub a1, a1, t2
> + bltu a1, t1, .LreachedCount
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, ma /* do 2*vlenb bytes per pass */
> +
> +1: vle8.v v2, (a0)
> + sub a1, a1, t1
> + vmseq.vx v0, v2, zero
> + vfirst.m t3, v0
> + bgez t3, .Lfound
> + add a0, a0, t1
> + bgeu a1, t1, 1b
> +.LreachedCount:
> + mv t2, a1 /* in case 0 < a1 < t1 */
> + bnez a1, 2b /* if so, still t2 bytes to check, all safe */
> +.LzeroCount:
> + sub a0, a0, t4
> + ret
> +
> +.Lfound: /* found the 0; subtract buffer start from current pointer */
> + add a0, a0, t3 /* and add offset into fetched data */
> + sub a0, a0, t4
> + ret
> +
> +.size __strnlen, .-__strnlen
> +weak_alias (__strnlen, strnlen)
> +libc_hidden_builtin_def (__strnlen)
> +libc_hidden_builtin_def (strnlen)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strrchr.S b/sysdeps/riscv/rv64/rvv/strrchr.S
> new file mode 100644
> index 0000000000..4bef8a3b9c
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strrchr.S
> @@ -0,0 +1,88 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strrchr
> +.type strrchr,@function
> +
> +/*
> + * optimized strrchr for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 2*vlenb
> + */
> +
> +.align 2
> +strrchr:
> + mv t5, a0 /* stash buffer ptr somewhere safe */
> + mv a0, zero /* result is nullptr unless we find better below */
> +
> + csrr t1, vlenb /* determine vlenb*2 */
> + add t1, t1, t1
> + addi t2, t1, -1 /* mask off unaligned part of ptr */
> + and t2, t5, t2
> + beqz t2, .Laligned
> +
> + sub t2, t1, t2 /* search to align ptr to 2*vlenb */
> + vsetvli t2, t2, e8, m2, ta, mu
> +
> + vle8.v v2, (t5) /* load data into v2(,v3) */
> + vmseq.vx v4, v2, zero /* check for null terminator */
> + vfirst.m t4, v4 /* grab its position, if any */
> + vmsbf.m v0, v4 /* select valid chars */
> + vmseq.vx v0, v2, a1, v0.t /* search for candidate byte */
> + vfirst.m t3, v0 /* grab its position, if any */
> + bltz t3, 2f /* did we find a candidate? */
> +
> +3: add a0, t3, t5 /* we did! grab the address */
> + vmsof.m v1, v0 /* there might be more than one */
> + vmandn.mm v0, v0, v1 /* so clear the one we just found */
> + vfirst.m t3, v0 /* is there another? */
> + bgez t3, 3b
> +
> +2: bgez t4, .Ldone /* did we see a null terminator? */
> + add t5, t5, t2
> +
> +.Laligned:
> + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> +
> +1: vle8.v v2, (t5)
> + vmseq.vx v4, v2, zero
> + vfirst.m t4, v4
> + vmsbf.m v0, v4
> + vmseq.vx v0, v2, a1, v0.t
> + vfirst.m t3, v0
> + bltz t3, 2f
> +
> +3: add a0, t3, t5
> + vmsof.m v1, v0
> + vmandn.mm v0, v0, v1
> + vfirst.m t3, v0
> + bgez t3, 3b
> +
> +2: add t5, t5, t1
> + bltz t4, 1b
> +
> +.Ldone:
> + ret
> +
> +.size strrchr, .-strrchr
> +libc_hidden_builtin_def (strrchr)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strspn.S b/sysdeps/riscv/rv64/rvv/strspn.S
> new file mode 100644
> index 0000000000..2b9af5cc2d
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strspn.S
> @@ -0,0 +1,189 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +#include <sysdep.h>
> +
> +.globl strspn
> +.type strspn,@function
> +
> +.globl strcspn
> +.type strcspn,@function
> +
> +/*
> + * optimized strspn / strcspn for riscv with vector extension
> + * assumptions:
> + * - vlenb is a power of 2
> + * - page size >= 32
> + * strategy:
> + * - build a 256-bit table on the stack, where each elt is zero
> + * if encountering it should terminate computation and nonzero otherwise
> + * - use vectorised lookups into this to check 2*vlen elts at a time;
> + * this code is identical for strspn and strcspn and can be shared
> + *
> + * note that while V mandates at least 128 bit wide regs,
> + * we are building a 256 bit lookup table
> + * therefore we use either LMUL=1 or 2 depending on what the target supports
> + * therefore we only use even vector register numbers,
> + * so everything still works if we go with LMUL=2
> + */
> +
> +# -----------------------------
> +
> +.align 2
> +
> +strspn:
> + lbu t0, 0(a1)
> + bnez t0, .Lbuild_table
> + mv a0, zero
> + ret
> +
> +.Lbuild_table:
> + mv a6, a0 /* store incoming a0 */
> + li t1, 32 /* want to deal with 256 bits at a time, so 32 bytes */
> +
> + vsetvli zero, t1, e8, m1, tu, mu
> +#if __riscv_v_min_vlen < 256
> + /* we want to build a 256-bit table: use vlenb*2 with m2 if regs
> + * are 128 bits wide, or vlenb with m1 if they are 256 bits or wider.
> + * 'V' extension specifies a minimum vlen of 128 so this should cover
> + * all cases; we can skip the check if we know vlen >= 256 at compile time
> + */
> + csrr t2, vlenb
> + bgeu t2, t1, 1f
> + vsetvli zero, t1, e8, m2, tu, mu
> +1:
> +#endif // __riscv_v_min_vlen
> +
> + /* read one char from the charset at a time and write the correct bit
> + * in the lookup table; we could do SIMD if we ever get an extension
> + * that provides some way of scattering bytes into a reg group
> + */
> + vmv.v.x v16, zero /* clear out table */
> + vmv.v.x v8, zero /* clear out v8 */
> + li t3, 1
> + vmv.s.x v8, t3 /* v8 now all zeroes except bottom byte */
> +
> +1: vmv.v.x v2, zero /* clear out v2 */
> + addi a1, a1, 1 /* advance charset ptr */
> + srli t2, t0, 3 /* divide the byte we read earlier by 8 */
> + vslideup.vx v2, v8, t2 /* v2 now 1 in the correct byte, 0 elsewhere */
> + vsll.vx v2, v2, t0 /* v2 now 1 in the correct bit, 0 elsewhere */
> + vor.vv v16, v16, v2 /* or it in */
> + lbu t0, 0(a1) /* fetch next byte */
> + bnez t0, 1b /* loop until we hit the null terminator */
> +
> +/*
> + * Table is now built in v16.
> + * Strategy:
> + * - fetch next t1 bytes from memory
> + * - vrgather on their values divided by 8 to get relevant bytes of table
> + * - shift right to get the correct bit into bit 0 (the lsb)
> + * - and with 1, compare with expected terminator value, then check mask
> + * to see if we've found a terminator
> + *
> + * Before we can begin, a0 needs to be t1-aligned, so that when we fetch
> + * the next t1 bytes - any of which may be the null terminator -
> + * we do not cross a page boundary and read unmapped memory. Therefore
> + * we have one read of however many bytes are needed to align a0,
> + * before the main loop.
> + */
> +
> +.Lscan_table:
> + vmv.v.x v8, t3 /* v8 now t1 bytes of 0x01 */
> +
> + and t2, a0, t1 /* mask to align to t1 */
> + beqz t2, 2f /* or skip if we're already aligned */
> + sub t2, t1, t2 /* t2 now bytes to read to align to t1 */
> +
> + vid.v v2 /* build mask instead of changing vl */
> + vmsltu.vx v0, v2, t2 /* so we don't need to track LMUL */
> +
> + vle8.v v2, (a0), v0.t /* load next bytes from input */
> + vsrl.vi v4, v2, 3 /* divide by 8 */
> + vrgather.vv v6, v16, v4 /* corresponding bytes of bit table */
> + vsrl.vv v6, v6, v2 /* shift correct bits to lsb */
> + vand.vv v6, v6, v8 /* and with 1 to complete the lookups */
> + vmseq.vx v4, v6, zero, v0.t /* check to see if any 0s are present */
> + vfirst.m t0, v4 /* index of the first 0, if any */
> + bgez t0, .Lscan_end /* if we found one, stop */
> + add a0, a0, t2 /* advance by number of bytes we read */
> +
> +2: add a6, a6, t1 /* we'll advance a0 before the exit check */
> +1: vle8.v v2, (a0) /* as above but unmasked so t1 elts per pass */
> + add a0, a0, t1
> +
> + vsrl.vi v4, v2, 3
> + vrgather.vv v6, v16, v4
> + vsrl.vv v6, v6, v2
> + vand.vv v6, v6, v8
> +
> + vmseq.vx v4, v6, zero
> + vfirst.m t0, v4
> + bltz t0, 1b
> +
> +.Lscan_end:
> + add a0, a0, t0 /* calculate offset to terminating byte */
> + sub a0, a0, a6
> + ret
> +.size strspn, .-strspn
> +
> +/* strcspn
> + *
> + * table build exactly as for strspn, except:
> + * - the lookup table starts with all bits except bit 0 of byte 0 set
> + * - we clear the corresponding bit for each byte in the charset
> + * once table is built, we can reuse the scan code directly
> + */
> +
> +strcspn:
> + lbu t0, 0(a1)
> + beqz t0, strlen /* no rejections -> prefix is whole string */
> +
> + mv a6, a0
> + li t1, 32
> +
> + vsetvli zero, t1, e8, m1, tu, mu
> +#if __riscv_v_min_vlen < 256
> + csrr t2, vlenb
> + bgeu t2, t1, 1f
> + vsetvli zero, t1, e8, m2, tu, mu
> +1:
> +#endif // __riscv_v_min_vlen
> +
> + vmv.v.x v8, zero
> + li t3, 1 /* all bits clear except bit 0 of byte 0 */
> + vmv.s.x v8, t3
> + vnot.v v16, v8 /* v16 is the inverse of that */
> + li t4, -1
> +
> +1: vmv.v.x v2, zero
> + addi a1, a1, 1 /* advance charset ptr */
> + srli t2, t0, 3 /* select correct bit in v2 */
> + vslideup.vx v2, v8, t2
> + vsll.vx v2, v2, t0
> + vnot.v v2, v2 /* invert */
> + vand.vv v16, v16, v2 /* clear the relevant bit of table */
> + lbu t0, 0(a1)
> + bnez t0, 1b
> + j .Lscan_table
> +.size strcspn, .-strcspn
> +
> +libc_hidden_builtin_def (strspn)
> +libc_hidden_builtin_def (strcspn)
> \ No newline at end of file
> --
> 2.34.1
>
Thank you for the feedback!
The idea of this changeset is to provide vectorised implementations for the
specific case where someone opts in by explicitly configuring the glibc
build for a target that supports the "V" extension; where this is not done,
existing behaviour is unchanged.
I agree that multiarch/ifuncs absolutely has to be the way forward.
Most people in the wild will be using prebuilt binaries, and generally
these will be built for the lowest common denominator; we do want to use
optimal code where we can detect that the target supports it, even if this
was not known at build time. Moreover, ifuncs allow much more specific
optimisation than a compile-time choice can (e.g. specialising on VLENB,
or deciding whether high LMUL or low LMUL with more unrolling is
preferable).
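To make the runtime-selection idea concrete, an ifunc resolver for one of
these routines could eventually look something like the sketch below. The
probe is entirely hypothetical: as discussed further down the thread, the
RISC-V capability-probing interface (syscall, hwcap or otherwise) is not
settled yet, and the __memcpy_* names are made up for illustration.

/* Sketch only; nothing here exists in the patch or in glibc today.  */
extern void *__memcpy_rvv (void *, const void *, unsigned long);
extern void *__memcpy_generic (void *, const void *, unsigned long);

typedef void *(*memcpy_fn) (void *, const void *, unsigned long);

static int
hypothetical_has_rvv (void)
{
  return 0;   /* placeholder for a future kernel/vDSO capability query */
}

static memcpy_fn
memcpy_resolver (void)
{
  return hypothetical_has_rvv () ? __memcpy_rvv : __memcpy_generic;
}

void *memcpy (void *, const void *, unsigned long)
     __attribute__ ((ifunc ("memcpy_resolver")));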
Even once the compiler issues are sorted out, coming up with a robust
mechanism for probing the relevant properties of the environment for
ifuncs is a significant engineering challenge on RISC-V, and one that I
expect to be addressing going forward. Once that is in place, ifuncs will
make it possible to use the vectorised implementations even in binaries
where the V extension was not explicitly targeted at compile time,
provided it is available at runtime.
Getting the optimised implementations out early for anyone opting in to
them will help anyone else working in this space and anyone looking at
benchmarks on specific targets; it would also get the code in front of more
eyeballs than just mine while I crack on with ifuncs.
Certainly, though, if the ifunc infrastructure has already been created by
someone else and can be shared, that would be super helpful; and if the
community would prefer that I keep these implementations private until
that point, we can do that.
On Wed, Feb 1, 2023 at 5:07 PM Jeff Law <jeffreyalaw@gmail.com> wrote:
>
>
> On 2/1/23 09:42, Florian Weimer wrote:
> > * Jeff Law via Libc-alpha:
> >
> >> On 2/1/23 02:52, Sergei Lewis wrote:
> >>> Initial implementations of memchr, memcmp, memcpy, memmove, memset,
> strchr,
> >>> strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
> >>> targeting the riscv "V" extension, version 1.0
> >>> The vectorised implementations assume VLENB of at least 128 and at
> >>> least 32
> >>> registers (as mandated by the "V" extension spec). They also assume
> that
> >>> VLENB is a power of two which is no larger than the page size, and (as
> >>> vectorised code in glibc for other platforms does) that it is safe to
> read
> >>> past null terminators / buffer ends provided one does not cross a page
> >>> boundary.
> >>> Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
> >>> ---
> >>> sysdeps/riscv/rv64/rvv/Implies | 2 +
> >>> sysdeps/riscv/rv64/rvv/memchr.S | 127 +++++++++++++++++++
> >>> sysdeps/riscv/rv64/rvv/memcmp.S | 93 ++++++++++++++
> >>> sysdeps/riscv/rv64/rvv/memcpy.S | 154 +++++++++++++++++++++++
> >>> sysdeps/riscv/rv64/rvv/memmove.c | 22 ++++
> >>> sysdeps/riscv/rv64/rvv/memset.S | 89 ++++++++++++++
> >>> sysdeps/riscv/rv64/rvv/strchr.S | 92 ++++++++++++++
> >>> sysdeps/riscv/rv64/rvv/strchrnul.c | 22 ++++
> >>> sysdeps/riscv/rv64/rvv/strcmp.S | 108 +++++++++++++++++
> >>> sysdeps/riscv/rv64/rvv/strcpy.S | 72 +++++++++++
> >>> sysdeps/riscv/rv64/rvv/strcspn.c | 22 ++++
> >>> sysdeps/riscv/rv64/rvv/strlen.S | 67 ++++++++++
> >>> sysdeps/riscv/rv64/rvv/strncmp.S | 104 ++++++++++++++++
> >>> sysdeps/riscv/rv64/rvv/strncpy.S | 96 +++++++++++++++
> >>> sysdeps/riscv/rv64/rvv/strnlen.S | 81 +++++++++++++
> >>> sysdeps/riscv/rv64/rvv/strrchr.S | 88 ++++++++++++++
> >>> sysdeps/riscv/rv64/rvv/strspn.S | 189
> +++++++++++++++++++++++++++++
> >> Does this need to be revamped given the recent push to do more with
> >> generic code and target specific hooks for mem* and str*?
> >>
> >> Shouldn't the implementations be in a multiarch directory? I would
> >> fully expect we're going to need both a vector and scalar
> >> implementation selected by an ifunc.
> >
> > I think most RISC-V GCC compilers won't have enabled IFUNC support?
> > Looking at gcc/config.gcc in GCC 12, I see this:
> >
> > *-*-linux* | *-*-gnu*)
> > case ${target} in
> >     aarch64*-* | arm*-* | i[34567]86-* | powerpc*-* | s390*-* | sparc*-* | x86_64-* | loongarch*-*)
> > default_gnu_indirect_function=yes
> > ;;
> > esac
> >
> > But maybe that's not the right place to look at?
> Clearly something we need to fix.
>
> I'd hesitate to turn on the gcc bits without having the kernel/user
> interface settled. There was a proposal that added a syscall to get the
> processor capabilities -- I'd asked the authors to reach out to you and
> Carlos on whether or not that was acceptable for glibc. I'm not sure if
> that happened or not.
>
> >
> > We have an assembler hack to be able to still build IFUNC resolvers
> > written in C, but I don't know if this works on RISC-V.
> It probably doesn't yet.
>
> >
> > Ideally the GCC defaults would change, too, and well before IFUNCs are
> > in common use.
> They're not common, but I suspect that'll change in the next ~6 months.
>
> Jeff
>
Thank you very much for the detailed review!
> > +#ifndef __riscv_strict_align
> Would this be defined by the compiler as a predefined macro, or is it just a debug
> switch? If the latter, I think it would be better to remove it.
The intent is to make use of the gcc feature in flight here:
https://gcc.gnu.org/pipermail/gcc-patches/2023-January/610115.html to
detect the situation where the build environment has been configured to
avoid unaligned access.
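In C terms, what that macro would guard amounts to something like this (a sketch
only, mirroring the word-at-a-time scan in memchr.S; in C the 8-byte load is
written as a memcpy so it is well defined either way, whereas the assembly uses a
plain ld, which is why the guard matters there):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static const void *
find_byte (const void *s, unsigned char c, size_t n)
{
  const unsigned char *p = s;
#ifndef __riscv_strict_align              /* assumed predefined by the gcc patch */
  const uint64_t ones  = 0x0101010101010101ULL;
  const uint64_t highs = ones << 7;       /* 0x8080808080808080 */
  const uint64_t rep   = ones * c;        /* c repeated in every byte */
  while (n >= 8)
    {
      uint64_t w, x;
      memcpy (&w, p, 8);                  /* possibly unaligned 8-byte load */
      x = w ^ rep;                        /* zero byte <=> match at that byte */
      if ((x - ones) & ~x & highs)        /* any zero byte in x? */
        break;                            /* locate the exact byte bytewise */
      p += 8;
      n -= 8;
    }
#endif
  for (; n > 0; p++, n--)
    if (*p == c)
      return p;
  return NULL;
}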
> Is it really worth adding a strrchr optimization? The generic implementation
> already calls strchr (which should be optimized).
The performance win is actually quite significant; consider searching for
the first space in a piece of text compared to the last space - reusing
strchr in a loop as the generic implementation does would cause branches
every few bytes of input, and essentially destroy all benefits gained from
vectorisation. Worse, vectorised functions typically have a few
instructions of setup/overhead before the work begins and these would
likewise be repeated every few bytes of input.
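For comparison, the strchr-reusing shape is roughly this (a simplified sketch of
the generic approach, not the actual glibc source):

#include <string.h>

static char *
strrchr_via_strchr (const char *s, int c)
{
  const char *found = NULL, *p;

  if (c == '\0')
    return strchr (s, '\0');

  /* Every hit restarts strchr, repeating its setup and taking a branch at
     each occurrence of c, rather than once per vector-width chunk.  */
  while ((p = strchr (s, c)) != NULL)
    {
      found = p;
      s = p + 1;
    }
  return (char *) found;
}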
> I wonder if we could adapt the generic implementation, so riscv only reimplements
> the vectorized search instead of all the boilerplate to generate the table and
> early tests.
The issue is that the table looks different for different implementations,
and possibly even for different cases in the same implementation; e.g. some
of the existing implementations use a 256-byte table with one byte per
character rather than a 256-bit bitfield as I do here (and going forward we
would potentially want such a path for riscv as well and select between
them based on the length of the character set - common use in parsing will
tend to produce very small character sets, but if we get a large one or
potentially always depending on architecture, using indexed loads/stores
will become faster than the bitfield approach I use here).
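Concretely, the two lookup shapes reduce to something like this scalar sketch
(the vector code does the equivalent with vrgather for the bitfield and would use
indexed loads for a byte table):

#include <stdint.h>

/* 256-bit bitfield: 32 bytes total, one bit per possible byte value.  */
static inline int
in_set_bits (const uint8_t set[32], unsigned char c)
{
  return (set[c >> 3] >> (c & 7)) & 1;
}

/* 256-byte table: eight times larger, but membership is a single indexed
   load, which maps directly onto gather-style accesses.  */
static inline int
in_set_bytes (const uint8_t set[256], unsigned char c)
{
  return set[c];
}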
I am integrating all the other feedback, and will also work with your
changes to the generic implementations - it looks like there is quite a bit
of potential to reduce and simplify my changeset once yours goes in.
On Wed, Feb 1, 2023 at 5:38 PM Adhemerval Zanella Netto <
adhemerval.zanella@linaro.org> wrote:
>
>
> On 01/02/23 06:52, Sergei Lewis wrote:
> > Initial implementations of memchr, memcmp, memcpy, memmove, memset,
> strchr,
> > strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
> > targeting the riscv "V" extension, version 1.0
> >
> > The vectorised implementations assume VLENB of at least 128 and at least
> 32
> > registers (as mandated by the "V" extension spec). They also assume that
> > VLENB is a power of two which is no larger than the page size, and (as
> > vectorised code in glibc for other platforms does) that it is safe to
> read
> > past null terminators / buffer ends provided one does not cross a page
> > boundary.
> >
> > Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
>
> Some comments that might be useful since I am working on the generic
> implementations below.
>
> Also, I think it should be split with one implementation per patch, unless the
> implementation is tied together (as for strchr/strchrnul for instance). Does
> the vectorized routine only work for rv64?
>
> > ---
> > sysdeps/riscv/rv64/rvv/Implies | 2 +
> > sysdeps/riscv/rv64/rvv/memchr.S | 127 +++++++++++++++++++
> > sysdeps/riscv/rv64/rvv/memcmp.S | 93 ++++++++++++++
> > sysdeps/riscv/rv64/rvv/memcpy.S | 154 +++++++++++++++++++++++
> > sysdeps/riscv/rv64/rvv/memmove.c | 22 ++++
> > sysdeps/riscv/rv64/rvv/memset.S | 89 ++++++++++++++
> > sysdeps/riscv/rv64/rvv/strchr.S | 92 ++++++++++++++
> > sysdeps/riscv/rv64/rvv/strchrnul.c | 22 ++++
> > sysdeps/riscv/rv64/rvv/strcmp.S | 108 +++++++++++++++++
> > sysdeps/riscv/rv64/rvv/strcpy.S | 72 +++++++++++
> > sysdeps/riscv/rv64/rvv/strcspn.c | 22 ++++
> > sysdeps/riscv/rv64/rvv/strlen.S | 67 ++++++++++
> > sysdeps/riscv/rv64/rvv/strncmp.S | 104 ++++++++++++++++
> > sysdeps/riscv/rv64/rvv/strncpy.S | 96 +++++++++++++++
> > sysdeps/riscv/rv64/rvv/strnlen.S | 81 +++++++++++++
> > sysdeps/riscv/rv64/rvv/strrchr.S | 88 ++++++++++++++
> > sysdeps/riscv/rv64/rvv/strspn.S | 189 +++++++++++++++++++++++++++++
> > 17 files changed, 1428 insertions(+)
> > create mode 100644 sysdeps/riscv/rv64/rvv/Implies
> > create mode 100644 sysdeps/riscv/rv64/rvv/memchr.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/memcmp.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/memcpy.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/memmove.c
> > create mode 100644 sysdeps/riscv/rv64/rvv/memset.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strchr.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strchrnul.c
> > create mode 100644 sysdeps/riscv/rv64/rvv/strcmp.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strcpy.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strcspn.c
> > create mode 100644 sysdeps/riscv/rv64/rvv/strlen.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strncmp.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strncpy.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strnlen.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strrchr.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strspn.S
> >
> > diff --git a/sysdeps/riscv/rv64/rvv/Implies
> b/sysdeps/riscv/rv64/rvv/Implies
> > new file mode 100644
> > index 0000000000..b07b4cb906
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/Implies
> > @@ -0,0 +1,2 @@
> > +riscv/rv64/rvd
> > +
> > diff --git a/sysdeps/riscv/rv64/rvv/memchr.S
> b/sysdeps/riscv/rv64/rvv/memchr.S
> > new file mode 100644
> > index 0000000000..a7e32b8f25
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/memchr.S
> > @@ -0,0 +1,127 @@
> > +
>
> Spurious new line at the start. We also require a brief comment describing
> the file contents for newer files.
>
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
>
> Not sure 2012 range fits here.
>
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +
> > +/* Optimised memchr for riscv with vector extension
> > + * Assumptions:
> > + * - cpu becomes bandwidth limited at or before
> > + * 2 vector register sized read/write operations
> > + * + 2 scalar operations
> > + * + conditional branch
> > + */
> > +
> > +.globl memchr
> > +.type memchr,@function
> > +
> > +.align 2
> > +memchr:
>
> We have the ENTRY macro for that.
>
> > + beqz a2, .Lnot_found
>
> Maybe use the L macro here for local labels;
>
> > + csrr t1, vlenb
> > + bgeu a2, t1, .Lvector_path /* only use vector path if we're
> scanning
> > + at least vlenb bytes */
> > +
> > +#ifndef __riscv_strict_align
>
> Would this be defined by the compiler as a predefined macro, or is it just a debug
> switch? If the latter, I think it would be better to remove it.
>
> > + li a3, 8
> > + blt a2, a3, .Lbytewise
> > +
> > + li t1, 0x0101010101010101
> > + slli a4, t1, 7 /* a4 = 0x8080808080808080 */
> > + mul t2, a1, t1 /* entirety of t2 is now repeats of target
> character;
> > + assume mul is at worst no worse than
> 3*(shift+OR),
> > + otherwise do that instead */
> > +
> > +/*
> > + * strategy:
> > + * t4 = ((*a0) ^ t2)
> > + * - now t4 contains zero bytes if and only if next word of memory
> > + * had target character at those positions
> > + *
> > + * t4 = ((t4-0x0101010101010101) & ~t4) & 0x8080808080808080
> > + * - all nonzero bytes of t4 become 0; zero bytes become 0x80
> > + *
> > + * if t4 is nonzero, find the index of the byte within it, add to a0
> and return
> > + * otherwise, loop
> > + */
> > +
> > +1:
> > + ld t4, (a0) /* t4 = load next 8 bytes */
> > + xor t4, t4, t2
> > + sub t5, t4, t1
> > + not t4, t4
> > + and t4, t5, t4
> > + and t4, t4, a4
> > + bnez t4, .Lbytewise /* could use ctzw, mod+lookup or just binary
> chop
> > + to locate byte of interest in t4 but
> profiling
> > + shows these approaches are at best no
> better */
> > + addi a2, a2, -8
> > + addi a0, a0, 8
> > + bgeu a2, a3, 1b
> > + beqz a2, .Lnot_found
> > +#endif // __riscv_strict_align
> > +
> > +/* too little data for a dword. mask calculation and branch mispredict
> costs
> > + make checking a word not worthwhile. degrade to bytewise search. */
> > +
> > +.Lbytewise:
> > + add t2, a0, a2
> > +
> > +1:
> > + lb t1, (a0)
> > + beq t1, a1, .Lfound
> > + addi a0, a0, 1
> > + blt a0, t2, 1b
> > +
> > +.Lnot_found:
> > + mv a0, zero
> > +.Lfound:
> > + ret
> > +
> > +.Lvector_path:
> > + vsetvli t2, a2, e8, m2, ta, ma
> > +
> > +1:
> > + vle8.v v2, (a0)
> > + vmseq.vx v0, v2, a1
> > + vfirst.m t3, v0
> > + bgez t3, .Lvec_found
> > + add a0, a0, t2
> > + sub a2, a2, t2
> > + bge a2, t2, 1b
> > + bnez a2, 2f
> > + mv a0, zero
> > + ret
> > +
> > +2:
> > + vsetvli t2, a2, e8, m2, ta, ma
> > + vle8.v v2, (a0)
> > + vmseq.vx v0, v2, a1
> > + vfirst.m t3, v0
> > + bgez t3, .Lvec_found
> > + mv a0, zero
> > + ret
> > +
> > +.Lvec_found:
> > + add a0, a0, t3
> > + ret
> > +
> > +.size memchr, .-memchr
> > +libc_hidden_builtin_def (memchr)
> > \ No newline at end of file
>
> Please add a newline.
>
> > diff --git a/sysdeps/riscv/rv64/rvv/strcpy.S
> b/sysdeps/riscv/rv64/rvv/strcpy.S
> > new file mode 100644
> > index 0000000000..b21909d66f
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strcpy.S
> > @@ -0,0 +1,72 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
>
> You can add an optimized stpcpy and use it to implement strcpy on top of that
> (as my generic proposal does [1]). ARMv6 does something similar [2]
>
> [1]
> https://patchwork.sourceware.org/project/glibc/patch/20230201170406.303978-12-adhemerval.zanella@linaro.org/
> [2]
> https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/arm/armv6/strcpy.S;h=e9f63a56c1c605a21b05f7ac21412585b0705171;hb=HEAD
>
> > +#include <sysdep.h>
> > +
> > +.globl strcpy
> > +.type strcpy,@function
> > +
> > +/*
> > + * optimized strcpy for riscv with vector extension
> > + * assumptions:
> > + * - vlenb is a power of 2
> > + * - page size >= 2*vlenb
> > + */
> > +
> > +.align 2
> > +strcpy:
> > + mv t0, a0 /* copy dest so we can
> return it */
> > +
> > + csrr t1, vlenb /* find vlenb*2 */
> > + add t1, t1, t1
> > +
> > + addi t2, t1, -1 /* mask unaligned part of
> ptr */
> > + and t2, a1, t2
> > + beqz t2, .Laligned
> > +
> > + sub t2, t1, t2 /* search enough to align
> ptr */
> > + vsetvli t2, t2, e8, m2, tu, mu
> > + vle8.v v2, (a1)
> > + vmseq.vx v4, v2, zero
> > + vmsif.m v0, v4 /* copy but not past null */
> > + vfirst.m t3, v4
> > + vse8.v v2, (t0), v0.t
> > + bgez t3, .Ldone
> > + add t0, t0, t2
> > + add a1, a1, t2
> > +
> > +.Laligned:
> > + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per
> pass */
> > +
> > +1:
> > + vle8.v v2, (a1)
> > + add a1, a1, t1
> > + vmseq.vx v4, v2, zero
> > + vmsif.m v0, v4
> > + vfirst.m t3, v4
> > + vse8.v v2, (t0), v0.t
> > + add t0, t0, t1
> > + bltz t3, 1b
> > +
> > +.Ldone:
> > + ret
> > +
> > +.size strcpy, .-strcpy
> > +libc_hidden_builtin_def (strcpy)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strcspn.c
> b/sysdeps/riscv/rv64/rvv/strcspn.c
> > new file mode 100644
> > index 0000000000..f0595a72fb
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strcspn.c
> > @@ -0,0 +1,22 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
> > +/* strcspn is implemented in strspn.S
> > + */
> > diff --git a/sysdeps/riscv/rv64/rvv/strlen.S
> b/sysdeps/riscv/rv64/rvv/strlen.S
> > new file mode 100644
> > index 0000000000..c77d500693
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strlen.S
> > @@ -0,0 +1,67 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +.globl strlen
> > +.type strlen,@function
> > +
> > +/*
> > + * optimized strlen for riscv with vector extension
> > + * assumptions:
> > + * - vlenb is a power of 2
> > + * - page size >= 2*vlenb
> > + */
> > +
> > +.align 2
> > +strlen:
> > + mv t4, a0 /* copy of buffer start */
> > + csrr t1, vlenb /* find vlenb*2 */
> > + add t1, t1, t1
> > + addi t2, t1, -1 /* mask off unaligned part of
> ptr */
> > + and t2, a0, t2
> > + beqz t2, .Laligned
> > +
> > + sub t2, t1, t2 /* search fwd to align ptr */
> > + vsetvli t2, t2, e8, m2, ta, ma
> > + vle8.v v2, (a0)
> > + vmseq.vx v0, v2, zero
> > + vfirst.m t3, v0
> > + bgez t3, .Lfound
> > + add a0, a0, t2
> > +
> > +.Laligned:
> > + vsetvli zero, t1, e8, m2, ta, ma /* search 2*vlenb bytes per
> pass */
> > + add t4, t4, t1
> > +
> > +1:
> > + vle8.v v2, (a0)
> > + add a0, a0, t1
> > + vmseq.vx v0, v2, zero
> > + vfirst.m t3, v0
> > + bltz t3, 1b
> > +
> > +.Lfound: /* found the 0; subtract
> */
> > + sub a0, a0, t4 /* buffer start from current
> ptr */
> > + add a0, a0, t3 /* and add offset into
> fetched */
> > + ret /* data to get length */
> > +
> > +.size strlen, .-strlen
> > +libc_hidden_builtin_def (strlen)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strncmp.S
> b/sysdeps/riscv/rv64/rvv/strncmp.S
> > new file mode 100644
> > index 0000000000..863e5cb525
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strncmp.S
> > @@ -0,0 +1,104 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +.globl strncmp
> > +.type strncmp,@function
> > +
> > +.align 2
> > +
> > +/* as strcmp, but with added checks on a2 (max count)
> > + */
> > +
> > +strncmp:
> > + csrr t1, vlenb /* find vlenb*2 */
> > + add t1, t1, t1
> > + blt a2, t1, .Ltail /* degrade if max < vlenb*2
> */
> > + vsetvli zero, t1, e8, m2, ta, mu
> > + vid.v v30
> > + addi t2, t1, -1 /* mask unaligned part of
> ptr */
> > + and t6, a0, t2 /* unaligned part of lhs */
> > + and t5, a1, t2 /* unaligned part of rhs */
> > + sub t6, t1, t6 /* safe count to read from
> lhs */
> > + sub t5, t1, t5 /* same, rhs */
> > + vmsltu.vx v28, v30, t6 /* mask for first part of
> lhs */
> > + vmsltu.vx v26, v30, t5 /* mask for first part of
> rhs */
> > + vmv.v.x v16, zero
> > + vmv.v.x v18, zero
> > +
> > +
> > +1: blt a2, t1, .Ltail
> > + vmv.v.v v0, v28 /* lhs mask */
> > + vle8.v v2, (a0), v0.t /* masked load from lhs */
> > + vmseq.vx v16, v2, zero, v0.t /* check loaded bytes for
> null */
> > + vmv.v.v v0, v26 /* rhs mask */
> > + vfirst.m t2, v16 /* get lhs check result */
> > + bgez t2, .Ltail /* can we safely check rest
> */
> > + vle8.v v4, (a1), v0.t /* masked load from
> rhs */
> > + vmseq.vx v18, v4, zero, v0.t /* check partial rhs
> */
> > + vmnot.m v0, v28 /* mask for rest of lhs */
> > + vfirst.m t3, v18 /* get check result */
> > + bltz t3, 2f /* test it */
> > + bge t3, t6, .Ltail
> > +
> > + vmsleu.vx v0, v30, t3 /* select rest of string +
> null */
> > + vmsne.vv v0, v2, v4, v0.t /* compare */
> > + vfirst.m t3, v0
> > + bgez t3, 3f
> > + mv a0, zero
> > + ret
> > +3: add a0, a0, t3
> > + add a1, a1, t3
> > + lbu t0, (a0)
> > + lbu t1, (a1)
> > +.Ldiff:
> > + sub a0, t0, t1
> > + ret
> > +
> > + /* ...no null terminator in first part of lhs or rhs */
> > +2: vle8.v v2, (a0), v0.t /* load rest of lhs */
> > + vmnot.m v0, v26 /* mask for rest of rhs */
> > + vle8.v v4, (a1), v0.t /* load rest of rhs */
> > + vmsne.vv v0, v2, v4 /* compare */
> > + add a0, a0, t1 /* advance ptrs */
> > + vfirst.m t3, v0
> > + add a1, a1, t1
> > + sub a2, a2, t1
> > + bltz t3, 1b
> > +
> > + sub t3, t3, t1 /* found a diff but we've already advanced a0 and
> a1 */
> > + j 3b
> > +
> > +.Ltail:
> > + beqz a2, 1f
> > + addi a2, a2, -1
> > + lbu t0, (a0)
> > + lbu t1, (a1)
> > + bne t0, t1, .Ldiff
> > + addi a0, a0, 1
> > + addi a1, a1, 1
> > + bnez t0, .Ltail
> > +1: mv a0, zero
> > + ret
> > +
> > +
> > +.size strncmp, .-strncmp
> > +libc_hidden_builtin_def (strncmp)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strncpy.S
> b/sysdeps/riscv/rv64/rvv/strncpy.S
> > new file mode 100644
> > index 0000000000..8b3a1e545c
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strncpy.S
> > @@ -0,0 +1,96 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +.globl strncpy
> > +.type strncpy,@function
> > +
> > +/*
> > + * optimized strcpy for riscv with vector extension
> > + * assumptions:
> > + * - vlenb is a power of 2
> > + * - page size >= 2*vlenb
> > + */
> > +
> > +.align 2
> > +strncpy:
> > + mv t0, a0 /* need to return dest so
> copy */
> > +
> > + csrr t1, vlenb /* find vlenb*2 */
> > + add t1, t1, t1
> > +
> > + addi t2, t1, -1 /* mask off unaligned part of
> ptr */
> > + and t2, a1, t2
> > + beqz t2, .Laligned
> > +
> > + sub t2, t1, t2 /* search to align the
> pointer */
> > + vsetvli zero, t2, e8, m2, tu, mu
> > + vle8.v v2, (a1)
> > + vmseq.vx v4, v2, zero
> > + vmsif.m v0, v4 /* copy to dest */
> > + vfirst.m t3, v4
> > + bgeu t2, a2, .Ldest_full
> > + vse8.v v2, (t0), v0.t
> > + bgez t3, .Lterminator_found
> > + add t0, t0, t2
> > + add a1, a1, t2
> > + sub a2, a2, t2
> > + beqz a2, .Ldone
> > +
> > +.Laligned:
> > + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per
> pass */
> > +
> > +1:
> > + vle8.v v2, (a1)
> > + add a1, a1, t1
> > + vmseq.vx v4, v2, zero
> > + vmsif.m v0, v4
> > + vfirst.m t3, v4
> > + bgeu t1, a2, .Ldest_full
> > + vse8.v v2, (t0), v0.t
> > + add t0, t0, t1
> > + sub a2, a2, t1
> > + bltz t3, 1b
> > + sub t0, t0, t1
> > +
> > +.Lterminator_found:
> > + addi sp, sp, -16
> > + sd ra, 0(sp)
> > + sd a0, 8(sp)
> > + add a0, t0, t3
> > + mv a1, zero
> > + sub a2, a2, t3
> > + jal ra, memset
> > + ld ra, 0(sp)
> > + ld a0, 8(sp)
> > + addi sp, sp, 16
> > +.Ldone:
> > + ret
> > +
> > +.Ldest_full:
> > + vid.v v6
> > + vmsltu.vx v4, v6, a2
> > + vmand.mm v0, v0, v4
> > + vse8.v v2, (t0), v0.t
> > + ret
> > +
> > +.size strncpy, .-strncpy
> > +libc_hidden_builtin_def (strncpy)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strnlen.S
> b/sysdeps/riscv/rv64/rvv/strnlen.S
> > new file mode 100644
> > index 0000000000..6d7ee65c7a
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strnlen.S
> > @@ -0,0 +1,81 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
> > +#include <sysdep.h>
> > +
>
> Maybe use a generic implementation that calls memchr (which should be optimized
> using vector instructions) [3]? It would be an extra function call, but it
> should really help on both code size and icache pressure.
>
> [3]
> https://patchwork.sourceware.org/project/glibc/patch/20230201170406.303978-6-adhemerval.zanella@linaro.org/
>
> > +.globl __strnlen
> > +.type __strnlen,@function
> > +
> > +/* vector optimized strnlen
> > + * assume it's safe to read to the end of the page
> > + * containing either a null terminator or the last byte of the count or
> both,
> > + * but not past it
> > + * assume page size >= vlenb*2
> > + */
> > +
> > +.align 2
> > +__strnlen:
> > + mv t4, a0 /* stash a copy of start for later
> */
> > + beqz a1, .LzeroCount
> > +
> > + csrr t1, vlenb /* find vlenb*2 */
> > + add t1, t1, t1
> > + addi t2, t1, -1 /* mask off unaligned part of ptr
> */
> > + and t2, a1, a0
> > + beqz t2, .Laligned
> > +
> > + sub t2, t1, t2 /* search to align pointer to t1 */
> > + bgeu t2, a1, 2f /* check it's safe */
> > + mv t2, a1 /* it's not! look as far as
> permitted */
> > +2: vsetvli t2, t2, e8, m2, ta, ma
> > + vle8.v v2, (a0)
> > + vmseq.vx v0, v2, zero
> > + vfirst.m t3, v0
> > + bgez t3, .Lfound
> > + add a0, a0, t2
> > + sub a1, a1, t2
> > + bltu a1, t1, .LreachedCount
> > +
> > +.Laligned:
> > + vsetvli zero, t1, e8, m2, ta, ma /* do 2*vlenb bytes per
> pass */
> > +
> > +1: vle8.v v2, (a0)
> > + sub a1, a1, t1
> > + vmseq.vx v0, v2, zero
> > + vfirst.m t3, v0
> > + bgez t3, .Lfound
> > + add a0, a0, t1
> > + bgeu a1, t1, 1b
> > +.LreachedCount:
> > + mv t2, a1 /* in case 0 < a1 < t1 */
> > + bnez a1, 2b /* if so, still t2 bytes to check, all safe */
> > +.LzeroCount:
> > + sub a0, a0, t4
> > + ret
> > +
> > +.Lfound: /* found the 0; subtract buffer start from current
> pointer */
> > + add a0, a0, t3 /* and add offset into fetched data */
> > + sub a0, a0, t4
> > + ret
> > +
> > +.size __strnlen, .-__strnlen
> > +weak_alias (__strnlen, strnlen)
> > +libc_hidden_builtin_def (__strnlen)
> > +libc_hidden_builtin_def (strnlen)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strrchr.S
> b/sysdeps/riscv/rv64/rvv/strrchr.S
> > new file mode 100644
> > index 0000000000..4bef8a3b9c
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strrchr.S
> > @@ -0,0 +1,88 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
> > +#include <sysdep.h>
>
> Is it really worth adding a strrchr optimization? The generic implementation
> already calls strchr (which should be optimized).
>
> > +
> > +.globl strrchr
> > +.type strrchr,@function
> > +
> > +/*
> > + * optimized strrchr for riscv with vector extension
> > + * assumptions:
> > + * - vlenb is a power of 2
> > + * - page size >= 2*vlenb
> > + */
> > +
> > +.align 2
> > +strrchr:
> > + mv t5, a0 /* stash buffer ptr somewhere safe */
> > + mv a0, zero /* result is nullptr unless we find better
> below */
> > +
> > + csrr t1, vlenb /* determine vlenb*2 */
> > + add t1, t1, t1
> > + addi t2, t1, -1 /* mask off unaligned part of
> ptr */
> > + and t2, t5, t2
> > + beqz t2, .Laligned
> > +
> > + sub t2, t1, t2 /* search to align ptr to
> 2*vlenb */
> > + vsetvli t2, t2, e8, m2, ta, mu
> > +
> > + vle8.v v2, (t5) /* load data into v2(,v3) */
> > + vmseq.vx v4, v2, zero /* check for null terminator */
> > + vfirst.m t4, v4 /* grab its position, if any */
> > + vmsbf.m v0, v4 /* select valid chars */
> > + vmseq.vx v0, v2, a1, v0.t /* search for candidate byte */
> > + vfirst.m t3, v0 /* grab its position, if any */
> > + bltz t3, 2f /* did we find a candidate? */
> > +
> > +3: add a0, t3, t5 /* we did! grab the address */
> > + vmsof.m v1, v0 /* there might be more than
> one */
> > + vmandn.mm v0, v0, v1 /* so clear the one we just
> found */
> > + vfirst.m t3, v0 /* is there another? */
> > + bgez t3, 3b
> > +
> > +2: bgez t4, .Ldone /* did we see a null
> terminator? */
> > + add t5, t5, t2
> > +
> > +.Laligned:
> > + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per
> pass */
> > +
> > +1: vle8.v v2, (t5)
> > + vmseq.vx v4, v2, zero
> > + vfirst.m t4, v4
> > + vmsbf.m v0, v4
> > + vmseq.vx v0, v2, a1, v0.t
> > + vfirst.m t3, v0
> > + bltz t3, 2f
> > +
> > +3: add a0, t3, t5
> > + vmsof.m v1, v0
> > + vmandn.mm v0, v0, v1
> > + vfirst.m t3, v0
> > + bgez t3, 3b
> > +
> > +2: add t5, t5, t1
> > + bltz t4, 1b
> > +
> > +.Ldone:
> > + ret
> > +
> > +.size strrchr, .-strrchr
> > +libc_hidden_builtin_def (strrchr)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strspn.S
> b/sysdeps/riscv/rv64/rvv/strspn.S
> > new file mode 100644
> > index 0000000000..2b9af5cc2d
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strspn.S
> > @@ -0,0 +1,189 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +.globl strspn
> > +.type strspn,@function
> > +
> > +.globl strcspn
> > +.type strcspn,@function
> > +
> > +/*
> > + * optimized strspn / strcspn for riscv with vector extension
> > + * assumptions:
> > + * - vlenb is a power of 2
> > + * - page size >= 32
> > + * strategy:
> > + * - build a 256-bit table on the stack, where each elt is zero
> > + * if encountering it should terminate computation and nonzero
> otherwise
> > + * - use vectorised lookups into this to check 2*vlen elts at a time;
> > + * this code is identical for strspan and strcspan and can be shared
> > + *
> > + * note that while V mandates at least 128 bit wide regs,
> > + * we are building a 256 bit lookup table
> > + * therefore we use either LMUL=1 or 2 depending on what the target
> supports
> > + * therefore we only use even vector register numbers,
> > + * so everything still works if we go with LMUL=2
> > + */
> > +
>
> I wonder if we could adapt the generic implementation, so riscv only reimplements
> the vectorized search instead of all the boilerplate to generate the table and
> early tests.
>
> > +# -----------------------------
> > +
> > +.align 2
> > +
> > +strspn:
> > + lbu t0, 0(a1)
> > + bnez t0, .Lbuild_table
> > + mv a0, zero
> > + ret
> > +
> > +.Lbuild_table:
> > + mv a6, a0 /* store incoming a0 */
> > + li t1, 32 /* want to deal with 256 bits at a time, so 32
> bytes */
> > +
> > + vsetvli zero, t1, e8, m1, tu, mu
> > +#if __riscv_v_min_vlen < 256
> > + /* we want to build a 256-bit table, so use vlenb*2,
> > + * m2 if regs are 128 bits wide or vlenb, m1 if >= 256
> > + * 'V' extension specifies a minimum vlen of 128 so this should
> cover
> > + * all cases; we can skip the check if we know vlen >= 256 at
> compile time
> > + */
> > + csrr t2, vlenb
> > + bgeu t2, t1, 1f
> > + vsetvli zero, t1, e8, m2, tu, mu
> > +1:
> > +#endif // __riscv_v_min_vlen
> > +
> > + /* read one char from the charset at a time and write the correct
> bit
> > + * in the lookup table; we could do SIMD iff we ever get an
> extension
> > + * that provides some way of scattering bytes into a reg group
> > + */
> > + vmv.v.x v16, zero /* clear out table */
> > + vmv.v.x v8, zero /* clear out v8 */
> > + li t3, 1
> > + vmv.s.x v8, t3 /* v8 now all zeroes except bottom byte
> */
> > +
> > +1: vmv.v.x v2, zero /* clear out v2 */
> > + addi a1, a1, 1 /* advance charset ptr */
> > + srli t2, t0, 3 /* divide the byte we read earlier by 8
> */
> > + vslideup.vx v2, v8, t2 /* v2 now 1 in the correct byte 0
> elsewhere */
> > + vsll.vx v2, v2, t0 /* v2 now 1 in the correct bit, 0
> elsewhere */
> > + vor.vv v16, v16, v2 /* or it in */
> > + lbu t0, 0(a1) /* fetch next byte */
> > + bnez t0, 1b /* if it's not null, go round again */
> > +
> > +/*
> > + * Table is now built in v16.
> > + * Strategy:
> > + * - fetch next t1 bytes from memory
> > + * - vrgather on their values divided by 8 to get relevant bytes of
> table
> > + * - shift right to get the correct bit into bit 1
> > + * - and with 1, compare with expected terminator value, then check
> mask
> > + * to see if we've found a terminator
> > + *
> > + * Before we can begin, a0 needs to be t1-aligned, so that when we
> fetch
> > + * the next t1 bytes - any of which may be the null terminator -
> > + * we do not cross a page boundary and read unmapped memory. Therefore
> > + * we have one read of however many bytes are needed to align a0,
> > + * before the main loop.
> > + */
> > +
> > +.Lscan_table:
> > + vmv.v.x v8, t3 /* v8 now t1 bytes of 0x01 */
> > +
> > + and t2, a0, t1 /* mask to align to t1 */
> > + beqz t2, 2f /* or skip if we're already aligned
> */
> > + sub t2, t1, t2 /* t2 now bytes to read to align to
> t1 */
> > +
> > + vid.v v2 /* build mask instead of changing
> vl */
> > + vmsltu.vx v0, v2, t2 /* so we don't need to track LMUL */
> > +
> > + vle8.v v2, (a0), v0.t /* load next bytes from input */
> > + vsrl.vi v4, v2, 3 /* divide by 8 */
> > + vrgather.vv v6, v16, v4 /* corresponding bytes of bit table
> */
> > + vsrl.vv v6, v6, v2 /* shift correct bits to lsb */
> > + vand.vv v6, v6, v8 /* and with 1 to complete the
> lookups */
> > + vmseq.vx v4, v6, zero, v0.t /* check to see if any 0s are
> present */
> > + vfirst.m t0, v4 /* index of the first 0, if any */
> > + bgez t0, .Lscan_end /* if we found one, stop */
> > + add a0, a0, t2 /* advance by number of bytes we
> read */
> > +
> > +2: add a6, a6, t1 /* we'll advance a0 before the exit
> check */
> > +1: vle8.v v2, (a0) /* as above but unmasked so t1 elts per
> pass */
> > + add a0, a0, t1
> > +
> > + vsrl.vi v4, v2, 3
> > + vrgather.vv v6, v16, v4
> > + vsrl.vv v6, v6, v2
> > + vand.vv v6, v6, v8
> > +
> > + vmseq.vx v4, v6, zero
> > + vfirst.m t0, v4
> > + bltz t0, 1b
> > +
> > +.Lscan_end:
> > + add a0, a0, t0 /* calculate offset to terminating byte
> */
> > + sub a0, a0, a6
> > + ret
> > +.size strspn, .-strspn
> > +
> > +/* strcspn
> > + *
> > + * table build exactly as for strspn, except:
> > + * - the lookup table starts with all bits except bit 0 of byte 0 set
> > + * - we clear the corresponding bit for each byte in the charset
> > + * once table is built, we can reuse the scan code directly
> > + */
> > +
> > +strcspn:
> > + lbu t0, 0(a1)
> > + beqz t0, strlen /* no rejections -> prefix is whole string
> */
> > +
> > + mv a6, a0
> > + li t1, 32
> > +
> > + vsetvli zero, t1, e8, m1, tu, mu
> > +#if __riscv_v_min_vlen < 256
> > + csrr t2, vlenb
> > + bgeu t2, t1, 1f
> > + vsetvli zero, t1, e8, m2, tu, mu
> > +1:
> > +#endif // __riscv_v_min_vlen
> > +
> > + vmv.v.x v8, zero
> > + li t3, 1 /* all bits clear except bit 0 of byte
> 0 */
> > + vmv.s.x v8, t3
> > + vnot.v v16, v8 /* v16 is the inverse of that */
> > + li t4, -1
> > +
> > +1: vmv.v.x v2, zero
> > + addi a1, a1, 1 /* advance charset ptr */
> > + srli t2, t0, 3 /* select correct bit in v2 */
> > + vslideup.vx v2, v8, t2
> > + vsll.vx v2, v2, t0
> > + vnot.v v2, v2 /* invert */
> > + vand.vv v16, v16, v2 /* clear the relevant bit of table */
> > + lbu t0, 0(a1)
> > + bnez t0, 1b
> > + j .Lscan_table
> > +.size strcspn, .-strcspn
> > +
> > +libc_hidden_builtin_def (strspn)
> > +libc_hidden_builtin_def (strcspn)
> > \ No newline at end of file
>
On 02/02/23 07:02, Sergei Lewis wrote:
> Thank you very much for the detailed review!
>
>> > +#ifndef __riscv_strict_align
>> Would this be defined by the compiler as a predefined macro, or is it just a debug
>> switch? If the latter, I think it would be better to remove it.
>
> The intent is to make use of the gcc feature in flight here: https://gcc.gnu.org/pipermail/gcc-patches/2023-January/610115.html to detect the situation where the build environment has been configured to avoid unaligned access.
I am not sure this will be a good way forward for glibc: it means *another* variant
to build, check, and validate and, worse, it is not tied to any ABI/cpu but to
a compiler option. I think it would be better to provide vectorized mem* and
str* that work independently of the compiler option used.
>
>
>> Is it really worth adding a strrchr optimization? The generic implementation
>> already calls strchr (which should be optimized).
>
> The performance win is actually quite significant; consider searching for the first space in a piece of text compared to the last space - reusing strchr in a loop as the generic implementation does would cause branches every few bytes of input, and essentially destroy all benefits gained from vectorisation. Worse, vectorised functions typically have a few instructions of setup/overhead before the work begins and these would likewise be repeated every few bytes of input.
Another option, which I will add in my default string refactor, is to use
strlen plus memrchr:
char *strrchr (const char *s, int c)
{
return __memrchr (s, c, strlen(s) + 1);
}
It would be only 2 function calls, and if the architecture provides optimized
strlen and memrchr, the performance overhead should be only the additional
function call (with the advantage of less icache pressure).
>
>> I wonder if we could adapt the generic implementation, so riscv only reimplements
>> the vectorized search instead of all the boilerplate to generate the table and
>> early tests.
>
> The issue is that the table looks different for different implementations, and possibly even for different cases in the same implementation; e.g. some of the existing implementations use a 256-byte table with one byte per character rather than a 256-bit bitfield as I do here (and going forward we would potentially want such a path for riscv as well and select between them based on the length of the character set - common use in parsing will tend to produce very small character sets, but if we get a large one or potentially always depending on architecture, using indexed loads/stores will become faster than the bitfield approach I use here).
I recall that I tested using a 256-bit bitfield instead of a 256-byte table, but
it incurred some overhead on most architectures (I might check again).
One option might be to parametrize both the table generation and the table search,
but it might not be profitable in every case.
>
> I am integrating all the other feedback, and will also work with your changes to the generic implementations - it looks like there is quite a bit of potential to reduce and simplify my changeset once yours goes in.
>
> On Wed, Feb 1, 2023 at 5:38 PM Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 01/02/23 06:52, Sergei Lewis wrote:
> > Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
> > strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
> > targeting the riscv "V" extension, version 1.0
> >
> > The vectorised implementations assume VLENB of at least 128 and at least 32
> > registers (as mandated by the "V" extension spec). They also assume that
> > VLENB is a power of two which is no larger than the page size, and (as
> > vectorised code in glibc for other platforms does) that it is safe to read
> > past null terminators / buffer ends provided one does not cross a page
> > boundary.
> >
> > Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
>
> Some comments that might be useful since I am working on the generic implementations
> below.
>
> Also, I think it should be split with one implementation per patch, unless the
> implementation is tied together (as for strchr/strchrnul for instance). Does
> the vectorized routine only work for rv64?
>
> > ---
> > sysdeps/riscv/rv64/rvv/Implies | 2 +
> > sysdeps/riscv/rv64/rvv/memchr.S | 127 +++++++++++++++++++
> > sysdeps/riscv/rv64/rvv/memcmp.S | 93 ++++++++++++++
> > sysdeps/riscv/rv64/rvv/memcpy.S | 154 +++++++++++++++++++++++
> > sysdeps/riscv/rv64/rvv/memmove.c | 22 ++++
> > sysdeps/riscv/rv64/rvv/memset.S | 89 ++++++++++++++
> > sysdeps/riscv/rv64/rvv/strchr.S | 92 ++++++++++++++
> > sysdeps/riscv/rv64/rvv/strchrnul.c | 22 ++++
> > sysdeps/riscv/rv64/rvv/strcmp.S | 108 +++++++++++++++++
> > sysdeps/riscv/rv64/rvv/strcpy.S | 72 +++++++++++
> > sysdeps/riscv/rv64/rvv/strcspn.c | 22 ++++
> > sysdeps/riscv/rv64/rvv/strlen.S | 67 ++++++++++
> > sysdeps/riscv/rv64/rvv/strncmp.S | 104 ++++++++++++++++
> > sysdeps/riscv/rv64/rvv/strncpy.S | 96 +++++++++++++++
> > sysdeps/riscv/rv64/rvv/strnlen.S | 81 +++++++++++++
> > sysdeps/riscv/rv64/rvv/strrchr.S | 88 ++++++++++++++
> > sysdeps/riscv/rv64/rvv/strspn.S | 189 +++++++++++++++++++++++++++++
> > 17 files changed, 1428 insertions(+)
> > create mode 100644 sysdeps/riscv/rv64/rvv/Implies
> > create mode 100644 sysdeps/riscv/rv64/rvv/memchr.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/memcmp.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/memcpy.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/memmove.c
> > create mode 100644 sysdeps/riscv/rv64/rvv/memset.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strchr.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strchrnul.c
> > create mode 100644 sysdeps/riscv/rv64/rvv/strcmp.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strcpy.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strcspn.c
> > create mode 100644 sysdeps/riscv/rv64/rvv/strlen.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strncmp.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strncpy.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strnlen.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strrchr.S
> > create mode 100644 sysdeps/riscv/rv64/rvv/strspn.S
> >
> > diff --git a/sysdeps/riscv/rv64/rvv/Implies b/sysdeps/riscv/rv64/rvv/Implies
> > new file mode 100644
> > index 0000000000..b07b4cb906
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/Implies
> > @@ -0,0 +1,2 @@
> > +riscv/rv64/rvd
> > +
> > diff --git a/sysdeps/riscv/rv64/rvv/memchr.S b/sysdeps/riscv/rv64/rvv/memchr.S
> > new file mode 100644
> > index 0000000000..a7e32b8f25
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/memchr.S
> > @@ -0,0 +1,127 @@
> > +
>
> Spurious new line at the start. We also require a brief comment describing
> the file contents for newer files.
>
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
>
> Not sure 2012 range fits here.
>
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +
> > +/* Optimised memchr for riscv with vector extension
> > + * Assumptions:
> > + * - cpu becomes bandwidth limited at or before
> > + * 2 vector register sized read/write operations
> > + * + 2 scalar operations
> > + * + conditional branch
> > + */
> > +
> > +.globl memchr
> > +.type memchr,@function
> > +
> > +.align 2
> > +memchr:
>
> We have the ENTRY macro for that.
>
> > + beqz a2, .Lnot_found
>
> Maybe use the L macro here for local labels;
>
> > + csrr t1, vlenb
> > + bgeu a2, t1, .Lvector_path /* only use vector path if we're scanning
> > + at least vlenb bytes */
> > +
> > +#ifndef __riscv_strict_align
>
> Would this be defined by the compiler as a predefined macro, or is it just a debug
> switch? If the latter, I think it would be better to remove it.
>
> > + li a3, 8
> > + blt a2, a3, .Lbytewise
> > +
> > + li t1, 0x0101010101010101
> > + slli a4, t1, 7 /* a4 = 0x8080808080808080 */
> > + mul t2, a1, t1 /* entirety of t2 is now repeats of target character;
> > + assume mul is at worst no worse than 3*(shift+OR),
> > + otherwise do that instead */
> > +
> > +/*
> > + * strategy:
> > + * t4 = ((*a0) ^ t2)
> > + * - now t4 contains zero bytes if and only if next word of memory
> > + * had target character at those positions
> > + *
> > + * t4 = ((t4-0x0101010101010101) & ~t4) & 0x8080808080808080
> > + * - all nonzero bytes of t4 become 0; zero bytes become 0x80
> > + *
> > + * if t4 is nonzero, find the index of the byte within it, add to a0 and return
> > + * otherwise, loop
> > + */
> > +
> > +1:
> > + ld t4, (a0) /* t4 = load next 8 bytes */
> > + xor t4, t4, t2
> > + sub t5, t4, t1
> > + not t4, t4
> > + and t4, t5, t4
> > + and t4, t4, a4
> > + bnez t4, .Lbytewise /* could use ctzw, mod+lookup or just binary chop
> > + to locate byte of interest in t4 but profiling
> > + shows these approaches are at best no better */
> > + addi a2, a2, -8
> > + addi a0, a0, 8
> > + bgeu a2, a3, 1b
> > + beqz a2, .Lnot_found
> > +#endif // __riscv_strict_align
> > +
> > +/* too little data for a dword. mask calculation and branch mispredict costs
> > + make checking a word not worthwhile. degrade to bytewise search. */
> > +
> > +.Lbytewise:
> > + add t2, a0, a2
> > +
> > +1:
> > + lb t1, (a0)
> > + beq t1, a1, .Lfound
> > + addi a0, a0, 1
> > + blt a0, t2, 1b
> > +
> > +.Lnot_found:
> > + mv a0, zero
> > +.Lfound:
> > + ret
> > +
> > +.Lvector_path:
> > + vsetvli t2, a2, e8, m2, ta, ma
> > +
> > +1:
> > + vle8.v v2, (a0)
> > + vmseq.vx v0, v2, a1
> > + vfirst.m t3, v0
> > + bgez t3, .Lvec_found
> > + add a0, a0, t2
> > + sub a2, a2, t2
> > + bge a2, t2, 1b
> > + bnez a2, 2f
> > + mv a0, zero
> > + ret
> > +
> > +2:
> > + vsetvli t2, a2, e8, m2, ta, ma
> > + vle8.v v2, (a0)
> > + vmseq.vx v0, v2, a1
> > + vfirst.m t3, v0
> > + bgez t3, .Lvec_found
> > + mv a0, zero
> > + ret
> > +
> > +.Lvec_found:
> > + add a0, a0, t3
> > + ret
> > +
> > +.size memchr, .-memchr
> > +libc_hidden_builtin_def (memchr)
> > \ No newline at end of file
>
> Please add a newline.
>
> > diff --git a/sysdeps/riscv/rv64/rvv/strcpy.S b/sysdeps/riscv/rv64/rvv/strcpy.S
> > new file mode 100644
> > index 0000000000..b21909d66f
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strcpy.S
> > @@ -0,0 +1,72 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
>
> You can add an optimized stpcpy and use it to implement strcpy on top of that
> (as my generic proposal does [1]). ARMv6 does something similar [2]
>
> [1] https://patchwork.sourceware.org/project/glibc/patch/20230201170406.303978-12-adhemerval.zanella@linaro.org/
> [2] https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/arm/armv6/strcpy.S;h=e9f63a56c1c605a21b05f7ac21412585b0705171;hb=HEAD
>
> > +#include <sysdep.h>
> > +
> > +.globl strcpy
> > +.type strcpy,@function
> > +
> > +/*
> > + * optimized strcpy for riscv with vector extension
> > + * assumptions:
> > + * - vlenb is a power of 2
> > + * - page size >= 2*vlenb
> > + */
> > +
> > +.align 2
> > +strcpy:
> > + mv t0, a0 /* copy dest so we can return it */
> > +
> > + csrr t1, vlenb /* find vlenb*2 */
> > + add t1, t1, t1
> > +
> > + addi t2, t1, -1 /* mask unaligned part of ptr */
> > + and t2, a1, t2
> > + beqz t2, .Laligned
> > +
> > + sub t2, t1, t2 /* search enough to align ptr */
> > + vsetvli t2, t2, e8, m2, tu, mu
> > + vle8.v v2, (a1)
> > + vmseq.vx v4, v2, zero
> > + vmsif.m v0, v4 /* copy but not past null */
> > + vfirst.m t3, v4
> > + vse8.v v2, (t0), v0.t
> > + bgez t3, .Ldone
> > + add t0, t0, t2
> > + add a1, a1, t2
> > +
> > +.Laligned:
> > + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> > +
> > +1:
> > + vle8.v v2, (a1)
> > + add a1, a1, t1
> > + vmseq.vx v4, v2, zero
> > + vmsif.m v0, v4
> > + vfirst.m t3, v4
> > + vse8.v v2, (t0), v0.t
> > + add t0, t0, t1
> > + bltz t3, 1b
> > +
> > +.Ldone:
> > + ret
> > +
> > +.size strcpy, .-strcpy
> > +libc_hidden_builtin_def (strcpy)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strcspn.c b/sysdeps/riscv/rv64/rvv/strcspn.c
> > new file mode 100644
> > index 0000000000..f0595a72fb
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strcspn.c
> > @@ -0,0 +1,22 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
> > +/* strcspn is implemented in strspn.S
> > + */
> > diff --git a/sysdeps/riscv/rv64/rvv/strlen.S b/sysdeps/riscv/rv64/rvv/strlen.S
> > new file mode 100644
> > index 0000000000..c77d500693
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strlen.S
> > @@ -0,0 +1,67 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +.globl strlen
> > +.type strlen,@function
> > +
> > +/*
> > + * optimized strlen for riscv with vector extension
> > + * assumptions:
> > + * - vlenb is a power of 2
> > + * - page size >= 2*vlenb
> > + */
> > +
> > +.align 2
> > +strlen:
> > + mv t4, a0 /* copy of buffer start */
> > + csrr t1, vlenb /* find vlenb*2 */
> > + add t1, t1, t1
> > + addi t2, t1, -1 /* mask off unaligned part of ptr */
> > + and t2, a0, t2
> > + beqz t2, .Laligned
> > +
> > + sub t2, t1, t2 /* search fwd to align ptr */
> > + vsetvli t2, t2, e8, m2, ta, ma
> > + vle8.v v2, (a0)
> > + vmseq.vx v0, v2, zero
> > + vfirst.m t3, v0
> > + bgez t3, .Lfound
> > + add a0, a0, t2
> > +
> > +.Laligned:
> > + vsetvli zero, t1, e8, m2, ta, ma /* search 2*vlenb bytes per pass */
> > + add t4, t4, t1
> > +
> > +1:
> > + vle8.v v2, (a0)
> > + add a0, a0, t1
> > + vmseq.vx v0, v2, zero
> > + vfirst.m t3, v0
> > + bltz t3, 1b
> > +
> > +.Lfound: /* found the 0; subtract */
> > + sub a0, a0, t4 /* buffer start from current ptr */
> > + add a0, a0, t3 /* and add offset into fetched */
> > + ret /* data to get length */
> > +
> > +.size strlen, .-strlen
> > +libc_hidden_builtin_def (strlen)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strncmp.S b/sysdeps/riscv/rv64/rvv/strncmp.S
> > new file mode 100644
> > index 0000000000..863e5cb525
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strncmp.S
> > @@ -0,0 +1,104 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +.globl strncmp
> > +.type strncmp,@function
> > +
> > +.align 2
> > +
> > +/* as strcmp, but with added checks on a2 (max count)
> > + */
> > +
> > +strncmp:
> > + csrr t1, vlenb /* find vlenb*2 */
> > + add t1, t1, t1
> > + blt a2, t1, .Ltail /* degrade if max < vlenb*2 */
> > + vsetvli zero, t1, e8, m2, ta, mu
> > + vid.v v30
> > + addi t2, t1, -1 /* mask unaligned part of ptr */
> > + and t6, a0, t2 /* unaligned part of lhs */
> > + and t5, a1, t2 /* unaligned part of rhs */
> > + sub t6, t1, t6 /* safe count to read from lhs */
> > + sub t5, t1, t5 /* same, rhs */
> > + vmsltu.vx v28, v30, t6 /* mask for first part of lhs */
> > + vmsltu.vx v26, v30, t5 /* mask for first part of rhs */
> > + vmv.v.x v16, zero
> > + vmv.v.x v18, zero
> > +
> > +
> > +1: blt a2, t1, .Ltail
> > + vmv.v.v v0, v28 /* lhs mask */
> > + vle8.v v2, (a0), v0.t /* masked load from lhs */
> > + vmseq.vx v16, v2, zero, v0.t /* check loaded bytes for null */
> > + vmv.v.v v0, v26 /* rhs mask */
> > + vfirst.m t2, v16 /* get lhs check result */
> > + bgez t2, .Ltail /* can we safely check rest */
> > + vle8.v v4, (a1), v0.t /* masked load from rhs */
> > + vmseq.vx v18, v4, zero, v0.t /* check partial rhs */
> > + vmnot.m v0, v28 /* mask for rest of lhs */
> > + vfirst.m t3, v18 /* get check result */
> > + bltz t3, 2f /* test it */
> > + bge t3, t6, .Ltail
> > +
> > + vmsleu.vx v0, v30, t3 /* select rest of string + null */
> > + vmsne.vv v0, v2, v4, v0.t /* compare */
> > + vfirst.m t3, v0
> > + bgez t3, 3f
> > + mv a0, zero
> > + ret
> > +3: add a0, a0, t3
> > + add a1, a1, t3
> > + lbu t0, (a0)
> > + lbu t1, (a1)
> > +.Ldiff:
> > + sub a0, t0, t1
> > + ret
> > +
> > + /* ...no null terminator in first part of lhs or rhs */
> > +2: vle8.v v2, (a0), v0.t /* load rest of lhs */
> > + vmnot.m v0, v26 /* mask for rest of rhs */
> > + vle8.v v4, (a1), v0.t /* load rest of rhs */
> > + vmsne.vv v0, v2, v4 /* compare */
> > + add a0, a0, t1 /* advance ptrs */
> > + vfirst.m t3, v0
> > + add a1, a1, t1
> > + sub a2, a2, t1
> > + bltz t3, 1b
> > +
> > + sub t3, t3, t1 /* found a diff but we've already advanced a0 and a1 */
> > + j 3b
> > +
> > +.Ltail:
> > + beqz a2, 1f
> > + addi a2, a2, -1
> > + lbu t0, (a0)
> > + lbu t1, (a1)
> > + bne t0, t1, .Ldiff
> > + addi a0, a0, 1
> > + addi a1, a1, 1
> > + bnez t0, .Ltail
> > +1: mv a0, zero
> > + ret
> > +
> > +
> > +.size strncmp, .-strncmp
> > +libc_hidden_builtin_def (strncmp)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strncpy.S b/sysdeps/riscv/rv64/rvv/strncpy.S
> > new file mode 100644
> > index 0000000000..8b3a1e545c
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strncpy.S
> > @@ -0,0 +1,96 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +.globl strncpy
> > +.type strncpy,@function
> > +
> > +/*
> > + * optimized strncpy for riscv with vector extension
> > + * assumptions:
> > + * - vlenb is a power of 2
> > + * - page size >= 2*vlenb
> > + */
> > +
> > +.align 2
> > +strncpy:
> > + mv t0, a0 /* need to return dest so copy */
> > +
> > + csrr t1, vlenb /* find vlenb*2 */
> > + add t1, t1, t1
> > +
> > + addi t2, t1, -1 /* mask off unaligned part of ptr */
> > + and t2, a1, t2
> > + beqz t2, .Laligned
> > +
> > + sub t2, t1, t2 /* search to align the pointer */
> > + vsetvli zero, t2, e8, m2, tu, mu
> > + vle8.v v2, (a1)
> > + vmseq.vx v4, v2, zero
> > + vmsif.m v0, v4 /* copy to dest */
> > + vfirst.m t3, v4
> > + bgeu t2, a2, .Ldest_full
> > + vse8.v v2, (t0), v0.t
> > + bgez t3, .Lterminator_found
> > + add t0, t0, t2
> > + add a1, a1, t2
> > + sub a2, a2, t2
> > + beqz a2, .Ldone
> > +
> > +.Laligned:
> > + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> > +
> > +1:
> > + vle8.v v2, (a1)
> > + add a1, a1, t1
> > + vmseq.vx v4, v2, zero
> > + vmsif.m v0, v4
> > + vfirst.m t3, v4
> > + bgeu t1, a2, .Ldest_full
> > + vse8.v v2, (t0), v0.t
> > + add t0, t0, t1
> > + sub a2, a2, t1
> > + bltz t3, 1b
> > + sub t0, t0, t1
> > +
> > +.Lterminator_found:
> > + addi sp, sp, -16
> > + sd ra, 0(sp)
> > + sd a0, 8(sp)
> > + add a0, t0, t3
> > + mv a1, zero
> > + sub a2, a2, t3
> > + jal ra, memset
> > + ld ra, 0(sp)
> > + ld a0, 8(sp)
> > + addi sp, sp, 16
> > +.Ldone:
> > + ret
> > +
> > +.Ldest_full:
> > + vid.v v6
> > + vmsltu.vx v4, v6, a2
> > +    vmand.mm    v0, v0, v4
> > + vse8.v v2, (t0), v0.t
> > + ret
> > +
> > +.size strncpy, .-strncpy
> > +libc_hidden_builtin_def (strncpy)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strnlen.S b/sysdeps/riscv/rv64/rvv/strnlen.S
> > new file mode 100644
> > index 0000000000..6d7ee65c7a
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strnlen.S
> > @@ -0,0 +1,81 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +#include <sysdep.h>
> > +
>
> Maybe use a generic implementation that issues memchr (which should be optimized
> using vector instructions) [3]?  It would be an extra function call, but it should really
> help on both code size and icache pressure.
>
> [3] https://patchwork.sourceware.org/project/glibc/patch/20230201170406.303978-6-adhemerval.zanella@linaro.org/
>
> > +.globl __strnlen
> > +.type __strnlen,@function
> > +
> > +/* vector optimized strnlen
> > + * assume it's safe to read to the end of the page
> > + * containing either a null terminator or the last byte of the count or both,
> > + * but not past it
> > + * assume page size >= vlenb*2
> > + */
> > +
> > +.align 2
> > +__strnlen:
> > + mv t4, a0 /* stash a copy of start for later */
> > + beqz a1, .LzeroCount
> > +
> > + csrr t1, vlenb /* find vlenb*2 */
> > + add t1, t1, t1
> > + addi t2, t1, -1 /* mask off unaligned part of ptr */
> > +    and t2, a0, t2
> > + beqz t2, .Laligned
> > +
> > + sub t2, t1, t2 /* search to align pointer to t1 */
> > + bgeu t2, a1, 2f /* check it's safe */
> > + mv t2, a1 /* it's not! look as far as permitted */
> > +2: vsetvli t2, t2, e8, m2, ta, ma
> > + vle8.v v2, (a0)
> > + vmseq.vx v0, v2, zero
> > + vfirst.m t3, v0
> > + bgez t3, .Lfound
> > + add a0, a0, t2
> > + sub a1, a1, t2
> > + bltu a1, t1, .LreachedCount
> > +
> > +.Laligned:
> > + vsetvli zero, t1, e8, m2, ta, ma /* do 2*vlenb bytes per pass */
> > +
> > +1: vle8.v v2, (a0)
> > + sub a1, a1, t1
> > + vmseq.vx v0, v2, zero
> > + vfirst.m t3, v0
> > + bgez t3, .Lfound
> > + add a0, a0, t1
> > + bgeu a1, t1, 1b
> > +.LreachedCount:
> > + mv t2, a1 /* in case 0 < a1 < t1 */
> > + bnez a1, 2b /* if so, still t2 bytes to check, all safe */
> > +.LzeroCount:
> > + sub a0, a0, t4
> > + ret
> > +
> > +.Lfound: /* found the 0; subtract buffer start from current pointer */
> > + add a0, a0, t3 /* and add offset into fetched data */
> > + sub a0, a0, t4
> > + ret
> > +
> > +.size __strnlen, .-__strnlen
> > +weak_alias (__strnlen, strnlen)
> > +libc_hidden_builtin_def (__strnlen)
> > +libc_hidden_builtin_def (strnlen)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strrchr.S b/sysdeps/riscv/rv64/rvv/strrchr.S
> > new file mode 100644
> > index 0000000000..4bef8a3b9c
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strrchr.S
> > @@ -0,0 +1,88 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +#include <sysdep.h>
>
> Is it really worth adding a strrchr optimization?  The generic implementation
> already calls strchr (which should be optimized).
>
> > +
> > +.globl strrchr
> > +.type strrchr,@function
> > +
> > +/*
> > + * optimized strrchr for riscv with vector extension
> > + * assumptions:
> > + * - vlenb is a power of 2
> > + * - page size >= 2*vlenb
> > + */
> > +
> > +.align 2
> > +strrchr:
> > + mv t5, a0 /* stash buffer ptr somewhere safe */
> > + mv a0, zero /* result is nullptr unless we find better below */
> > +
> > + csrr t1, vlenb /* determine vlenb*2 */
> > + add t1, t1, t1
> > + addi t2, t1, -1 /* mask off unaligned part of ptr */
> > + and t2, t5, t2
> > + beqz t2, .Laligned
> > +
> > + sub t2, t1, t2 /* search to align ptr to 2*vlenb */
> > + vsetvli t2, t2, e8, m2, ta, mu
> > +
> > + vle8.v v2, (t5) /* load data into v2(,v3) */
> > + vmseq.vx v4, v2, zero /* check for null terminator */
> > + vfirst.m t4, v4 /* grab its position, if any */
> > + vmsbf.m v0, v4 /* select valid chars */
> > + vmseq.vx v0, v2, a1, v0.t /* search for candidate byte */
> > + vfirst.m t3, v0 /* grab its position, if any */
> > + bltz t3, 2f /* did we find a candidate? */
> > +
> > +3: add a0, t3, t5 /* we did! grab the address */
> > + vmsof.m v1, v0 /* there might be more than one */
> > +    vmandn.mm v0, v0, v1    /* so clear the one we just found */
> > + vfirst.m t3, v0 /* is there another? */
> > + bgez t3, 3b
> > +
> > +2: bgez t4, .Ldone /* did we see a null terminator? */
> > + add t5, t5, t2
> > +
> > +.Laligned:
> > + vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> > +
> > +1: vle8.v v2, (t5)
> > + vmseq.vx v4, v2, zero
> > + vfirst.m t4, v4
> > + vmsbf.m v0, v4
> > + vmseq.vx v0, v2, a1, v0.t
> > + vfirst.m t3, v0
> > + bltz t3, 2f
> > +
> > +3: add a0, t3, t5
> > + vmsof.m v1, v0
> > +    vmandn.mm v0, v0, v1
> > + vfirst.m t3, v0
> > + bgez t3, 3b
> > +
> > +2: add t5, t5, t1
> > + bltz t4, 1b
> > +
> > +.Ldone:
> > + ret
> > +
> > +.size strrchr, .-strrchr
> > +libc_hidden_builtin_def (strrchr)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strspn.S b/sysdeps/riscv/rv64/rvv/strspn.S
> > new file mode 100644
> > index 0000000000..2b9af5cc2d
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strspn.S
> > @@ -0,0 +1,189 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library. If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +.globl strspn
> > +.type strspn,@function
> > +
> > +.globl strcspn
> > +.type strcspn,@function
> > +
> > +/*
> > + * optimized strspn / strcspn for riscv with vector extension
> > + * assumptions:
> > + * - vlenb is a power of 2
> > + * - page size >= 32
> > + * strategy:
> > + * - build a 256-bit table on the stack, where each elt is zero
> > + * if encountering it should terminate computation and nonzero otherwise
> > + * - use vectorised lookups into this to check 2*vlen elts at a time;
> > + *   this code is identical for strspn and strcspn and can be shared
> > + *
> > + * note that while V mandates at least 128 bit wide regs,
> > + * we are building a 256 bit lookup table,
> > + * so we use either LMUL=1 or LMUL=2 depending on what the target supports;
> > + * we only use even vector register numbers,
> > + * so everything still works if we go with LMUL=2
> > + */
> > +
>
> I wonder if we could adapt the generic implementation, so riscv only reimplements
> the vectorized search instead of all the boilerplate to generate the table and
> early tests.
>
> > +# -----------------------------
> > +
> > +.align 2
> > +
> > +strspn:
> > + lbu t0, 0(a1)
> > + bnez t0, .Lbuild_table
> > + mv a0, zero
> > + ret
> > +
> > +.Lbuild_table:
> > + mv a6, a0 /* store incoming a0 */
> > + li t1, 32 /* want to deal with 256 bits at a time, so 32 bytes */
> > +
> > + vsetvli zero, t1, e8, m1, tu, mu
> > +#if __riscv_v_min_vlen < 256
> > + /* we want to build a 256-bit table, so use vlenb*2,
> > + * m2 if regs are 128 bits wide or vlenb, m1 if >= 256
> > + * 'V' extension specifies a minimum vlen of 128 so this should cover
> > + * all cases; we can skip the check if we know vlen >= 256 at compile time
> > + */
> > + csrr t2, vlenb
> > + bgeu t2, t1, 1f
> > + vsetvli zero, t1, e8, m2, tu, mu
> > +1:
> > +#endif // __riscv_v_min_vlen
> > +
> > + /* read one char from the charset at a time and write the correct bit
> > + * in the lookup table; we could do SIMD iff we ever get an extension
> > + * that provides some way of scattering bytes into a reg group
> > + */
> > + vmv.v.x v16, zero /* clear out table */
> > + vmv.v.x v8, zero /* clear out v8 */
> > + li t3, 1
> > + vmv.s.x v8, t3 /* v8 now all zeroes except bottom byte */
> > +
> > +1: vmv.v.x v2, zero /* clear out v2 */
> > + addi a1, a1, 1 /* advance charset ptr */
> > + srli t2, t0, 3 /* divide the byte we read earlier by 8 */
> > + vslideup.vx v2, v8, t2 /* v2 now 1 in the correct byte 0 elsewhere */
> > + vsll.vx v2, v2, t0 /* v2 now 1 in the correct bit, 0 elsewhere */
> > + vor.vv v16, v16, v2 /* or it in */
> > +    lbu t0, 0(a1)          /* fetch next byte */
> > +    bnez t0, 1b            /* if it's not null, go round again */
> > +
> > +/*
> > + * Table is now built in v16.
> > + * Strategy:
> > + * - fetch next t1 bytes from memory
> > + * - vrgather on their values divided by 8 to get relevant bytes of table
> > + * - shift right to get the correct bit into bit 1
> > + * - and with 1, compare with expected terminator value, then check mask
> > + * to see if we've found a terminator
> > + *
> > + * Before we can begin, a0 needs to be t1-aligned, so that when we fetch
> > + * the next t1 bytes - any of which may be the null terminator -
> > + * we do not cross a page boundary and read unmapped memory. Therefore
> > + * we have one read of however many bytes are needed to align a0,
> > + * before the main loop.
> > + */
> > +
> > +.Lscan_table:
> > + vmv.v.x v8, t3 /* v8 now t1 bytes of 0x01 */
> > +
> > + and t2, a0, t1 /* mask to align to t1 */
> > + beqz t2, 2f /* or skip if we're already aligned */
> > + sub t2, t1, t2 /* t2 now bytes to read to align to t1 */
> > +
> > + vid.v v2 /* build mask instead of changing vl */
> > + vmsltu.vx v0, v2, t2 /* so we don't need to track LMUL */
> > +
> > + vle8.v v2, (a0), v0.t /* load next bytes from input */
> > +    vsrl.vi v4, v2, 3      /* divide by 8 */
> > + vrgather.vv v6, v16, v4 /* corresponding bytes of bit table */
> > + vsrl.vv v6, v6, v2 /* shift correct bits to lsb */
> > + vand.vv v6, v6, v8 /* and with 1 to complete the lookups */
> > + vmseq.vx v4, v6, zero, v0.t /* check to see if any 0s are present */
> > + vfirst.m t0, v4 /* index of the first 0, if any */
> > + bgez t0, .Lscan_end /* if we found one, stop */
> > + add a0, a0, t2 /* advance by number of bytes we read */
> > +
> > +2: add a6, a6, t1 /* we'll advance a0 before the exit check */
> > +1: vle8.v v2, (a0) /* as above but unmasked so t1 elts per pass */
> > + add a0, a0, t1
> > +
> > +    vsrl.vi v4, v2, 3
> > + vrgather.vv v6, v16, v4
> > + vsrl.vv v6, v6, v2
> > + vand.vv v6, v6, v8
> > +
> > + vmseq.vx v4, v6, zero
> > + vfirst.m t0, v4
> > + bltz t0, 1b
> > +
> > +.Lscan_end:
> > + add a0, a0, t0 /* calculate offset to terminating byte */
> > + sub a0, a0, a6
> > + ret
> > +.size strspn, .-strspn
> > +
> > +/* strcspn
> > + *
> > + * table build exactly as for strspn, except:
> > + * - the lookup table starts with all bits except bit 0 of byte 0 set
> > + * - we clear the corresponding bit for each byte in the charset
> > + * once table is built, we can reuse the scan code directly
> > + */
> > +
> > +strcspn:
> > + lbu t0, 0(a1)
> > + beqz t0, strlen /* no rejections -> prefix is whole string */
> > +
> > + mv a6, a0
> > + li t1, 32
> > +
> > + vsetvli zero, t1, e8, m1, tu, mu
> > +#if __riscv_v_min_vlen < 256
> > + csrr t2, vlenb
> > + bgeu t2, t1, 1f
> > + vsetvli zero, t1, e8, m2, tu, mu
> > +1:
> > +#endif // __riscv_v_min_vlen
> > +
> > + vmv.v.x v8, zero
> > + li t3, 1 /* all bits clear except bit 0 of byte 0 */
> > + vmv.s.x v8, t3
> > + vnot.v v16, v8 /* v16 is the inverse of that */
> > + li t4, -1
> > +
> > +1: vmv.v.x v2, zero
> > + addi a1, a1, 1 /* advance charset ptr */
> > + srli t2, t0, 3 /* select correct bit in v2 */
> > + vslideup.vx v2, v8, t2
> > + vsll.vx v2, v2, t0
> > + vnot.v v2, v2 /* invert */
> > + vand.vv v16, v16, v2 /* clear the relevant bit of table */
> > + lbu t0, 0(a1)
> > + bnez t0, 1b
> > + j .Lscan_table
> > +.size strcspn, .-strcspn
> > +
> > +libc_hidden_builtin_def (strspn)
> > +libc_hidden_builtin_def (strcspn)
> > \ No newline at end of file
>
>
> I think it would be better to provide vectorized mem* and
> str* that work independently of the compiler option used.
>
A generic vectorized mem*/str* with no scalar fallback has no issues with
alignment and is actually smaller and simpler code as well. The awkwardness
here is performance of very small operations, which are a significant
portion of the calls to these functions in practice in the wild: for
operations much smaller than the vector length, a scalar implementation is
faster - but this is only true if it either makes no unaligned accesses, or
unaligned accesses are permitted and reasonably performant on the target,
which (as others have mentioned here) may not be the case on RISCV; and
there is a limit to how much we can check each invocation without paying
more for the checks than we save. RISCV vector length, though, may be quite
large, so basing the tradeoff on that with fallback to a scalar loop that
just processes a byte per iteration may also be prohibitive.
Using ifuncs would, of course, provide a way to address this once the
required support / plumbing is in place. I'll look at shelving
the microoptimisations until then and sticking to more generic code here.
Using the newly visible OP_T_THRES from your patchset may be the way
forward in the interim.
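For illustration, a minimal C sketch of that kind of size-threshold dispatch; SMALL_THRESHOLD and memcpy_vector are hypothetical stand-ins (e.g. for a tuning constant such as OP_T_THRES and an RVV routine), not the patch's actual code:

#include <stddef.h>

#define SMALL_THRESHOLD 16    /* hypothetical tuning constant, e.g. OP_T_THRES */

/* hypothetical vectorised routine */
extern void *memcpy_vector (void *dst, const void *src, size_t n);

void *
memcpy_dispatch (void *dst, const void *src, size_t n)
{
  if (n < SMALL_THRESHOLD)
    {
      /* Tiny copy: a byte loop makes no unaligned accesses and avoids
         the vector path's setup cost.  */
      unsigned char *d = dst;
      const unsigned char *s = src;
      while (n--)
        *d++ = *s++;
      return dst;
    }
  /* Large enough to amortise the vector path's overhead.  */
  return memcpy_vector (dst, src, n);
}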
> Another option, which I will add on my default string refactor, is to use
> strlen plus memrchr:
>
> char *strrchr (const char *s, int c)
> {
> return __memrchr (s, c, strlen(s) + 1);
> }
>
> It would be only 2 function calls, and if the architecture provides optimized
> strlen and memrchr, the performance overhead should be only the additional
> function call (with the advantage of less icache pressure).
>
This approach still means you're reading the entire s buffer twice in the
worst case: once to find the end, then again to scan it for c. It's not the
end of the world - often, s will be small and c present, and s will be in
cache for the second read, so the tradeoff is arguably less clear than
what's in the generic code now. I'll see if I can gather some more info on
usage in the wild.
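For comparison, a single-pass strrchr can be sketched in plain C as below, remembering the last match while scanning for the terminator so the buffer is only read once; this is illustrative only, not the vectorised code under discussion:

#include <stddef.h>

char *
strrchr_onepass (const char *s, int c)
{
  const char *last = NULL;
  unsigned char ch = (unsigned char) c;

  do
    {
      if ((unsigned char) *s == ch)
        last = s;                 /* remember the most recent match */
    }
  while (*s++ != '\0');           /* the terminator itself is searched too */

  return (char *) last;
}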
> I recall that I tested using a 256-bit bitfield instead of a 256-byte table,
> but
> it incurred some overhead on most architectures (I might check again).
>
> One option might be to parametrize both the table generation and the table
> search,
>
I'm using performance measurements here, of course; this sort of tradeoff
might be another one for ifuncs or even selected at runtime. What I suggest
might be slightly unfortunate here, though, is the creation of an internal
glibc api that forces one particular design choice on all platforms in all
situations.
Note that the cost of the table generation step is almost as important as
the cost of the scan - e.g. strtok() is implemented using these and
generally called in a tight loop by code in the wild; use of that directly
or similar patterns during parsing/tokenization means these functions are
typically invoked many times, frequently, and the standard library provides
no way for the table to be reused between the calls even though
the accept/reject chars remain the same. So it is likely that people
optimising glibc for different platforms will still want to provide
optimized paths for the table generation even if it is factored out into a
separate module.
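To make that cost concrete, here is a rough C sketch of the per-call table build implied by the generic strspn-style approach; it is illustrative only, not glibc's actual generic implementation:

#include <stddef.h>

size_t
strspn_sketch (const char *s, const char *accept)
{
  unsigned char table[256] = { 0 };

  /* Rebuilt on every call, even when strtok-style callers pass the same
     accept set over and over.  */
  for (const unsigned char *a = (const unsigned char *) accept; *a != '\0'; ++a)
    table[*a] = 1;

  const unsigned char *p = (const unsigned char *) s;
  while (table[*p])               /* table['\0'] stays 0, so this stops */
    ++p;
  return (size_t) (p - (const unsigned char *) s);
}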
In general, I suggest caution with tradeoffs between function calls and
code reuse: on modern superscalar architectures, the cost of a mispredicted
branch can be huge in terms of the number of operations that could
otherwise get retired, and although the function invocation and return
themselves are completely predictable, each of these functions contains a
loop with an end condition that is data driven, so essentially random and
all but guaranteeing a mispredict per call in practice; folding
functionality into a single loop (for tiny pieces of code like these, for
small inputs) is a noticeable win over two calls each with its own loop.
On 2/1/23 11:03, Andrew Waterman wrote:
>> +#ifndef __riscv_strict_align
> strict-align is not the right thing to check here. As the RVA profile
> document explains, all RVA-compliant implementations must support
> misaligned loads and stores (so strict-align will be false), but they
> might execute extremely slowly (e.g., via trap and emulate), and so
> this approach will unduly penalize some implementations.
FWIW, the proposed __riscv_strict_align if generated can have 2 possible
values:
- 1 (explicit -mstrict-align used in build)
- 2 cpu tune param indicated unaligned access is slow (like with
trap-n-emulate)
So in theory code can still be written to cater to that.
-Vineet
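For illustration, a sketch of how source might key off that proposed define, assuming the two values described above; the macro is a proposal in this thread, not an established predefine:

#if defined __riscv_strict_align
/* Proposed macro: 1 means -mstrict-align was used, 2 means the tune param
   says unaligned access is slow (e.g. trap-and-emulate).  Either way,
   avoid misaligned accesses.  */
# define USE_MISALIGNED_ACCESS 0
#else
/* Unaligned accesses are legal and presumed reasonably fast.  */
# define USE_MISALIGNED_ACCESS 1
#endif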
On Thu, Feb 2, 2023 at 4:13 PM Vineet Gupta <vineetg@rivosinc.com> wrote:
>
>
>
> On 2/1/23 11:03, Andrew Waterman wrote:
> >> +#ifndef __riscv_strict_align
> > strict-align is not the right thing to check here. As the RVA profile
> > document explains, all RVA-compliant implementations must support
> > misaligned loads and stores (so strict-align will be false), but they
> > might execute extremely slowly (e.g., via trap and emulate), and so
> > this approach will unduly penalize some implementations.
>
> FWIW, the proposed __riscv_strict_align if generated can have 2 possible
> values:
> - 1 (explicit -mstrict-align used in build)
> - 2 cpu tune param indicated unaligned access is slow (like with
> trap-n-emulate)
Yeah, those semantics make sense. It makes the "strict" name a little
misleading, though: "strict" suggests to me that misaligned accesses
are outright illegal. So, it might be better to pick another name,
e.g. __riscv_avoid_misaligned.
>
> So in theory code can still be written to cater to that.
It had better cater to it in practice as well as in theory. Standard
binary distributions need to be suitably generic, and they need to
heed the guidance in the RVA profile spec. So this is OK as long as
the default continues to be to avoid misaligned accesses. The fact
that GCC's default -mtune setting is to mark them as slow means this
is probably OK.
This may be Yet Another IFUNC Case: use misaligned accesses only if
known at runtime that they are fast, or when there's a routine
available that's optimized for a specific microarchitecture.
(Microarchitecture-specific IFUNCs would probably be more appropriate
for these routines, anyway, since they're pretty clearly tuned for a
particular machine. For example, the trivial 7-instruction memcpy
loop recommended in the specification will perform well across a
broader range of vector machines.)
>
> -Vineet
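As a hedged sketch of the IFUNC idea mentioned above: the resolver runs at relocation time and picks an implementation, so misaligned-access or microarchitecture-specific variants can be selected per machine. The names (my_memcpy, memcpy_rvv, unaligned_access_is_fast) are hypothetical, and the runtime predicate stands in for whatever hwcap/hwprobe-style plumbing eventually exists:

#include <stddef.h>

extern void *memcpy_generic (void *, const void *, size_t);  /* hypothetical */
extern void *memcpy_rvv (void *, const void *, size_t);      /* hypothetical */

/* Hypothetical runtime predicate.  */
extern int unaligned_access_is_fast (void);

static void *
my_memcpy_resolver (void)
{
  /* Pick the variant once, when the symbol is resolved.  */
  return unaligned_access_is_fast () ? (void *) memcpy_rvv
                                     : (void *) memcpy_generic;
}

/* GNU IFUNC: the dynamic linker calls the resolver to choose which
   implementation the symbol binds to.  */
void *my_memcpy (void *dst, const void *src, size_t n)
  __attribute__ ((ifunc ("my_memcpy_resolver")));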
On 02/02/23 12:20, Sergei Lewis wrote:
> I think it would be better to provide vectorized mem* and
>
> str* that work independently of the compiler option used.
>
>
> A generic vectorized mem*/str* with no scalar fallback has no issues with alignment and is actually smaller and simpler code as well. The awkwardness here is performance of very small operations, which are a significant portion of the calls to these functions in practice in the wild: for operations much smaller than the vector length, a scalar implementation is faster - but this is only true if it either makes no unaligned accesses, or unaligned accesses are permitted and reasonably performant on the target, which (as others have mentioned here) may not be the case on RISCV; and there is a limit to how much we can check each invocation without paying more for the checks than we save. RISCV vector length, though, may be quite large, so basing the tradeoff on that with fallback to a scalar loop that just processes a byte per iteration may also be prohibitive.
>
> Using ifuncs would, of course, provide a way to address this once the required support / plumbing is in place. I'll look at shelving the microoptimisations until then and sticking to more generic code here. Using the newly visible OP_T_THRES from your patchset may be the way forward in the interim.
Yes, this is similar to all other architectures that provide vector/simd
instructions (aarch64 SVE for instance uses a similar strategy). That's
not the issue, in fact; the issue is having an extra compiler define
flag that affects binary distribution and incurs extra maintenance
that is not tied to the ABI definition. It means that we will also need to
build/check RVV with __riscv_strict_align.
Is there a meaningful performance difference on always using the
__riscv_strict_align code path, to have an implementation that works regardless
of whether the chip has fast unaligned memory access support?
The VLEN arbitrary upper bound and page size limit is also worrisome, as
Andrew has pointed out. I would prefer to either have a generic implementation
that works without such limits (or assumes something more sane, like aarch64
with unaligned access, which expects the minimum supported 4k page size), or have
a way to fall back to the default implementation if the criteria are not met.
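One hedged sketch of such a fallback check, reading vlenb at runtime and verifying the stated assumptions before taking the vector path; it requires compiling with the V extension enabled so the vlenb CSR read is legal, and the function names are illustrative rather than part of the patch:

#include <stdbool.h>
#include <unistd.h>

static inline unsigned long
read_vlenb (void)
{
  unsigned long vlenb;
  __asm__ ("csrr %0, vlenb" : "=r" (vlenb));
  return vlenb;
}

static bool
rvv_str_assumptions_hold (void)
{
  unsigned long vlenb = read_vlenb ();
  unsigned long page = (unsigned long) sysconf (_SC_PAGESIZE);

  /* vlenb must be a power of two and 2*vlenb must not exceed the page
     size, matching the assumptions stated in the patch headers.  */
  return (vlenb & (vlenb - 1)) == 0 && 2 * vlenb <= page;
}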
>
>
> Another option, which I will add on my default string refactor, is to use
> strlen plus memrchr:
>
> char *strrchr (const char *s, int c)
> {
> return __memrchr (s, c, strlen(s) + 1);
> }
>
> It would be only 2 function calls, and if the architecture provides optimized
> strlen and memrchr, the performance overhead should be only the additional
> function call (with the advantage of less icache pressure).
>
>
> This approach still means you're reading the entire s buffer twice in the worst case: once to find the end, then again to scan it for c. It's not the end of the world - often, s will be small and c present, and s will be in cache for the second read, so the tradeoff is arguably less clear than what's in the generic code now. I'll see if I can gather some more info on usage in the wild.
Yes, ideally we would do the strrchr in only one pass over the string. But the
idea here is really to try to use composability, especially when the symbol usage
has a very low probability of being a hotspot.
>
>
> I recall that I tested using a 256-bit bitfield instead of a 256-byte table, but
> it incurred some overhead on most architectures (I might check again).
>
> One option might be to parametrize both the table generation and the table search,
>
>
> I'm using performance measurements here, of course; this sort of tradeoff might be another one for ifuncs or even selected at runtime. What I suggest might be slightly unfortunate here, though, is the creation of an internal glibc api that forces one particular design choice on all platforms in all situations.
I see it the other way around: it eases maintenance and removes complexity. To
give you an example, many memchr implementations use a strategy of first calculating
the final address from the input address and size. And some failed to take into
consideration that it might overflow [1]. We had to fix it on multiple
implementations, whereas with a parametrized implementation we could just fix it
on the default one.
Of course, where the architecture optimization does not fit the generic
framework, it is up to arch maintainers to use a specialized one. But the generic
implementation I am proposing has been used over the years by multiple architectures
to generate multiple arch-specific implementations.
[1] https://sourceware.org/bugzilla/show_bug.cgi?id=21182
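For reference, a minimal sketch of the hazard: computing the end pointer up front can wrap when s + n overflows, whereas counting n down sidesteps it (illustrative C, not any particular implementation):

#include <stddef.h>

void *
memchr_countdown (const void *s, int c, size_t n)
{
  const unsigned char *p = s;

  /* Risky pattern: `const unsigned char *end = p + n; while (p < end)'
     can misbehave if p + n wraps around.  Counting n down avoids it.  */
  while (n--)
    {
      if (*p == (unsigned char) c)
        return (void *) p;
      ++p;
    }
  return NULL;
}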
>
> Note that the cost of the table generation step is almost as important as the cost of the scan - e.g. strtok() is implemented using these and generally called in a tight loop by code in the wild; use of that directly or similar patterns during parsing/tokenization means these functions are typically invoked many times, frequently, and the standard library provides no way for the table to be reused between the calls even though the accept/reject chars remain the same. So it is likely that people optimising glibc for different platforms will still want to provide optimized paths for the table generation even if it is factored out into a separate module.
I think using a bitfield for the table might be profitable for tightly
called loops as you suggest; I will evaluate its usage in the generic code.
But again, my idea is to parametrize only what the
architecture needs to optimize, and, if the compiler improves (through
autovectorization or other optimization passes), to avoid the need for such
arch-specific implementations.
> > Is there a meaningful performance difference on always using the
> __riscv_strict_align code path
Minimum VLEN is 16 bytes, so for random operations we will average 8
iterations of a bytewise loop vs 2 of a word-oriented one.
That said, I'm currently working on a v2 patch that removes the scalar
fallback entirely - over a suite of random operation sizes, dropping the
word-based loop is more expensive than just not having the fallback at all.
So these patterns will all go away and the code will look much more like
what's in the ISA manual.
> The VLEN arbitrary upper bound and page size limit is also worrisome, as
> Andrew has pointed out. I would prefer to either have a generic
> implementation
> that works without such limits
>
I realise I have not responded there yet - I'm certainly not ignoring this,
but investigating options before I commit; e.g. another option might be to
gate the affected code behind a compile-time vlen check, and use fault-only-first
loads as Andrew suggests where there is not enough information
provided at compile time to prove the approach is safe.
These decisions will all become much more straightforward with ifunc
support - a generic version for the most common situation and runtime
selection of more specific versions would resolve all these issues and also
open the gates for people working on widely different implementations to
easily provide their own versions of as many or as few of these functions
as needed - and I do expect there will be a number of these, since the
architecture is super flexible and the ecosystem is already looking quite
fragmented. Accordingly, I am also investigating what will be involved in
getting ifuncs support in place.
FWIW it turns out that IFUNC resolution is supported by gcc for riscv
targets where all the underlying tools support them (which, in current
riscv-gnu-toolchain, they do). Support is detected and enabled here:
https://github.com/gcc-mirror/gcc/blob/master/gcc/configure.ac#L3027
Having a stable finalised kernel/user interface for caps would help, but
for this purpose a few CSR reads should hopefully be enough to make sane
decisions.
new file mode 100644
@@ -0,0 +1,2 @@
+riscv/rv64/rvd
+
new file mode 100644
@@ -0,0 +1,127 @@
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+
+/* Optimised memchr for riscv with vector extension
+ * Assumptions:
+ * - cpu becomes bandwidth limited at or before
+ * 2 vector register sized read/write operations
+ * + 2 scalar operations
+ * + conditional branch
+ */
+
+.globl memchr
+.type memchr,@function
+
+.align 2
+memchr:
+ beqz a2, .Lnot_found
+ csrr t1, vlenb
+ bgeu a2, t1, .Lvector_path /* only use vector path if we're scanning
+ at least vlenb bytes */
+
+#ifndef __riscv_strict_align
+ li a3, 8
+ blt a2, a3, .Lbytewise
+
+ li t1, 0x0101010101010101
+ slli a4, t1, 7 /* a4 = 0x8080808080808080 */
+ mul t2, a1, t1 /* entirety of t2 is now repeats of target character;
+ assume mul is at worst no worse than 3*(shift+OR),
+ otherwise do that instead */
+
+/*
+ * strategy:
+ * t4 = ((*a0) ^ t2)
+ * - now t4 contains zero bytes if and only if next word of memory
+ * had target character at those positions
+ *
+ * t4 = ((t4-0x0101010101010101) & ~t4) & 0x8080808080808080
+ * - all nonzero bytes of t4 become 0; zero bytes become 0x80
+ *
+ * if t4 is nonzero, find the index of the byte within it, add to a0 and return
+ * otherwise, loop
+ */
+
+1:
+ ld t4, (a0) /* t4 = load next 8 bytes */
+ xor t4, t4, t2
+ sub t5, t4, t1
+ not t4, t4
+ and t4, t5, t4
+ and t4, t4, a4
+ bnez t4, .Lbytewise /* could use ctzw, mod+lookup or just binary chop
+ to locate byte of interest in t4 but profiling
+ shows these approaches are at best no better */
+ addi a2, a2, -8
+ addi a0, a0, 8
+ bgeu a2, a3, 1b
+ beqz a2, .Lnot_found
+#endif // __riscv_strict_align
+
+/* too little data for a dword. mask calculation and branch mispredict costs
+ make checking a word not worthwhile. degrade to bytewise search. */
+
+.Lbytewise:
+ add t2, a0, a2
+
+1:
+ lb t1, (a0)
+ beq t1, a1, .Lfound
+ addi a0, a0, 1
+ blt a0, t2, 1b
+
+.Lnot_found:
+ mv a0, zero
+.Lfound:
+ ret
+
+.Lvector_path:
+ vsetvli t2, a2, e8, m2, ta, ma
+
+1:
+ vle8.v v2, (a0)
+ vmseq.vx v0, v2, a1
+ vfirst.m t3, v0
+ bgez t3, .Lvec_found
+ add a0, a0, t2
+ sub a2, a2, t2
+ bge a2, t2, 1b
+ bnez a2, 2f
+ mv a0, zero
+ ret
+
+2:
+ vsetvli t2, a2, e8, m2, ta, ma
+ vle8.v v2, (a0)
+ vmseq.vx v0, v2, a1
+ vfirst.m t3, v0
+ bgez t3, .Lvec_found
+ mv a0, zero
+ ret
+
+.Lvec_found:
+ add a0, a0, t3
+ ret
+
+.size memchr, .-memchr
+libc_hidden_builtin_def (memchr)
\ No newline at end of file
new file mode 100644
@@ -0,0 +1,93 @@
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+/* Optimised memcmp for riscv with vector extension
+ */
+
+.globl memcmp
+.type memcmp,@function
+
+.align 2
+
+memcmp:
+ mv t2, zero
+ beqz a2, .Ldone
+
+ li t1, 5 /* scalar path cheaper for 1-4 elts */
+ bltu a2, t1, .Lscalar
+
+ /* main loop, vlenb*2 elts at a time */
+ vsetvli t1, a2, e8, m2, ta, ma
+
+1:
+ vle8.v v2, (a0) /* load elts */
+ vle8.v v4, (a1)
+ vmsne.vv v0, v2, v4 /* compare */
+ vfirst.m t3, v0
+ bgez t3, .Lvec_diff /* found a difference ? */
+ add a0, a0, t1 /* not yet, advance everything */
+ add a1, a1, t1
+ sub a2, a2, t1
+ bgeu a2, t1, 1b
+
+ bnez a2, .Ltail
+ mv a0, zero
+ ret
+
+.Ltail:
+ /* handle tail. we know a2 < vlenb*2 so just load and compare the lot */
+ vsetvli t1, a2, e8, m2, ta, ma
+ vle8.v v2, (a0)
+ vle8.v v4, (a1)
+ vmsne.vv v0, v2, v4
+ vfirst.m t3, v0
+ bgez t3, .Lvec_diff
+ mv a0, zero /* no diff found */
+ ret
+
+.Lvec_diff: /* v2, v4 differ at elt t3 */
+ add a0, a0, t3
+ add a1, a1, t3
+ lbu t0, (a0)
+ lbu t1, (a1)
+ sub a0, t0, t1
+ ret
+
+.Lscalar:
+ add t3, a0, a2
+
+1:
+ lbu t0, (a0)
+ lbu t1, (a1)
+ sub t2, t0, t1
+ bnez t2, .Ldone
+ addi a0, a0, 1
+ addi a1, a1, 1
+ bltu a0, t3, 1b
+
+.Ldone:
+ mv a0, t2
+ ret
+
+
+.size memcmp, .-memcmp
+libc_hidden_builtin_def (memcmp)
\ No newline at end of file
new file mode 100644
@@ -0,0 +1,154 @@
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+/* Optimised memcpy and memmove for riscv with vector extension
+ */
+
+.globl memcpy
+.type memcpy,@function
+.globl memmove
+.type memmove,@function
+
+.align 2
+memmove:
+ bge a0, a1, .Lmemcpy_rev
+
+memcpy:
+.Lmemcpy_fwd:
+ mv t0, a0 /* t0 = preserve a0 so we can return it */
+ csrr t2, vlenb /* t2 = number of bytes per vectorised copy op */
+ slli t5, t2, 1 /* t5 = number of bytes per loop */
+ addi t3, t5, -1 /* generate mask */
+ not t4, t3
+ and t4, a2, t4 /* t4 = bytes copied in vectorised pass */
+
+ beqz t4, .Lscalar_fwd /* size too small for even one pass? */
+
+ and a2, a2, t3 /* a2 = bytes still left to copy after pass */
+ add t4, t4, a1 /* t4 = src at end of vectorised pass */
+
+1:
+ vl2r.v v2, (a1) /* load, advance source */
+ add a1, a1, t5
+ vs2r.v v2, (t0) /* store, advance dest */
+ add t0, t0, t5
+ bltu a1, t4, 1b /* src at end? */
+
+ bltu a2, t2, .Lscalar_fwd /* should we do one more vec load/store? */
+ vl1r.v v2, (a1)
+ sub a2, a2, t2
+ add a1, a1, t2
+ vs1r.v v2, (t0)
+ add t0, t0, t2
+
+.Lscalar_fwd:
+ bnez a2, .Lnobail
+.Lbail:
+ ret
+.Lnobail:
+
+#ifndef __riscv_strict_align
+ addi t2, zero, 4
+ bltu a2, t2, .Lsingle_bytes
+1:
+ lw t3, 0(a1)
+ addi a1, a1, 4
+ sw t3, 0(t0)
+ addi t0, t0, 4
+ addi a2, a2, -4
+ bgeu a2, t2, 1b
+#endif // __riscv_strict_align
+
+.Lsingle_bytes:
+ beqz a2, .Lbail
+ add a2, a2, a1 /* a2 = src + remaining size */
+1:
+ lb t1, 0(a1)
+ sb t1, 0(t0)
+ addi a1, a1, 1
+ addi t0, t0, 1
+ bltu a1, a2, 1b
+ ret
+.size memcpy, .-memcpy
+
+
+.Lmemcpy_rev:
+ beq a0, a1, .Lmemcpy_rev_done
+ add t0, a0, a2 /* t0 = dest so we can return a0=dest later */
+ add t6, a1, a2 /* dest and src both point to byte */
+ /* immediately after end of buffer */
+
+ csrr t2, vlenb /* t2 = number of bytes per pass */
+ slli t5, t2, 1 /* t5 = number of bytes per entire loop */
+ addi t3, t5, -1 /* t3 = (bytes per loop) mask */
+ not t4, t3 /* generate mask for bytes processed by loop */
+ and t4, a2, t4 /* t4 = bytes copied in vectorised pass */
+
+ beqz t4, .Lscalar_rev /* size too small for even one pass? */
+
+ and a2, a2, t3 /* a2 = bytes still left to copy after pass */
+ sub t4, t6, t4 /* t4 = src at end of vectorised pass */
+
+1:
+ sub t6, t6, t5
+ sub t0, t0, t5
+ vl2r.v v2, (t6) /* load, advance source */
+ vs2r.v v2, (t0) /* store, advance dest */
+ bgtu t6, t4, 1b /* src at end? */
+
+ bltu a2, t2, .Lscalar_rev /* should we do one more vec load/store? */
+ sub t6, t6, t2
+ sub t0, t0, t2
+ sub a2, a2, t2
+ vl1r.v v2, (t6)
+ vs1r.v v2, (t0)
+
+.Lscalar_rev:
+#ifndef __riscv_strict_align
+ beqz a2, .Lbail
+
+ addi t2, zero, 4
+ bltu a2, t2, 2f
+1:
+ addi t6, t6, -4
+ addi t0, t0, -4
+ addi a2, a2, -4
+ lw t3, 0(t6)
+ sw t3, 0(t0)
+ bgeu a2, t2, 1b
+2:
+#endif // __riscv_strict_align
+
+ beqz a2, .Lbail
+1:
+ addi t6, t6, -1
+ addi t0, t0, -1
+ lb t1, 0(t6)
+ sb t1, 0(t0)
+ bgtu t0, a0, 1b
+
+.Lmemcpy_rev_done:
+ ret
+
+.size memmove, .-memmove
+libc_hidden_builtin_def (memcpy)
+libc_hidden_builtin_def (memmove)
\ No newline at end of file
new file mode 100644
@@ -0,0 +1,22 @@
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+/* memmove is implemented in memcpy.S
+ */
new file mode 100644
@@ -0,0 +1,89 @@
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+
+/* Optimised memset for riscv with vector extension
+ */
+
+.globl memset
+.type memset,@function
+
+.align 2
+memset:
+ mv t0, a0 /* t0 = dest so we can return a0 later */
+ vsetvli t2, a2, e8, m2, ta, ma /* t2 = elts per copy */
+ beqz t2, .Lscalar
+
+ vmv.v.x v2, a1 /* splat value across v2 */
+
+ slli t3, t2, 1
+ bgtu t3, a2, .Lsinglestore
+
+1:
+ vse8.v v2, (t0)
+ add t0, t0, t2
+ vse8.v v2, (t0)
+ add t0, t0, t2
+ sub a2, a2, t3
+ bgeu a2, t3, 1b
+ bgeu a2, t2, .Lsinglestore
+ bnez a2, .Lscalar
+
+.Lbail:
+ ret
+
+.Lsinglestore:
+ bgtu t2, a2, .Lscalar
+ vse8.v v2, (t0)
+ add t0, t0, t2
+ sub a2, a2, t2
+
+.Lscalar:
+ beqz a2, .Lbail
+
+#ifndef __riscv_strict_align
+ slli t2, a1, 8
+ or a1, a1, t2
+ slli t2, a1, 16
+ or a1, a1, t2
+
+ addi t2, zero, 4
+ bltu a2, t2, 2f
+
+1:
+ sw a1, 0(t0)
+ addi t0, t0, 4
+ addi a2, a2, -4
+ bgeu a2, t2, 1b
+2:
+ beqz a2, .Lbail
+#endif // __riscv_strict_align
+
+ add a2, a2, t0
+1:
+ sb a1, 0(t0)
+ addi t0, t0, 1
+ bltu t0, a2, 1b
+ ret
+
+.size memset, .-memset
+libc_hidden_builtin_def (memset)
\ No newline at end of file
new file mode 100644
@@ -0,0 +1,92 @@
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+.globl strchr
+.type strchr,@function
+
+.globl __strchrnul
+.type __strchrnul,@function
+
+/*
+ * optimized strchr for riscv with vector extension
+ * assumptions:
+ * - vlenb is a power of 2
+ * - page size >= 2*vlenb
+ */
+
+.align 2
+__strchrnul:
+ li t5, -1
+ j 1f
+
+strchr:
+ mv t5, zero
+1: csrr t1, vlenb /* find vlenb*2 */
+ add t1, t1, t1
+ addi t2, t1, -1 /* mask off unaligned part of pointer */
+ and t2, a0, t2
+ beqz t2, .Laligned
+
+ sub t2, t1, t2 /* search however many bytes
+ are needed to align the pointer */
+ vsetvli t2, t2, e8, m2, ta, mu
+
+ vle8.v v2, (a0) /* load data into v2(,v3) */
+ vmseq.vx v4, v2, zero
+ vfirst.m t4, v4
+ vmsbf.m v0, v4
+ vmseq.vx v0, v2, a1, v0.t
+ vfirst.m t3, v0
+ bgez t3, .Lfound
+ bgez t4, .Lbufferend
+ add a0, a0, t2
+
+.Laligned:
+ vsetvli zero, t1, e8, m2, ta, mu
+ li t4, -1
+
+1:
+ vle8.v v2, (a0)
+ vmseq.vx v4, v2, zero
+ vfirst.m t4, v4
+ vmsbf.m v0, v4
+ vmseq.vx v0, v2, a1, v0.t
+ vfirst.m t3, v0
+ bgez t3, .Lfound
+ bgez t4, .Lbufferend
+ add a0, a0, t1
+ j 1b
+
+.Lfound: /* found the target at a0+t3 */
+ add a0, a0, t3
+ ret
+
+.Lbufferend:
+ add a0, a0, t4
+ and a0, a0, t5
+ ret
+
+.size strchr, .-strchr
+.size __strchrnul, .-__strchrnul
+
+libc_hidden_builtin_def (strchr)
+weak_alias (__strchrnul, strchrnul)
\ No newline at end of file
new file mode 100644
@@ -0,0 +1,22 @@
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+/* strchrnul is implemented in strchr.S
+ */
new file mode 100644
@@ -0,0 +1,108 @@
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+.globl strcmp
+.type strcmp,@function
+
+.align 2
+
+/* most of the time, one or both sides is unaligned and their alignments differ
+ * we need to check for a null terminator before crossing a page boundary
+ * strategy:
+ * - for each side, calculate masks for alignment and (vlenb * 2) - alignment
+ * - while no difference encountered:
+ * - for each side:
+ * - load bytes to end of next vlenb*2 block
+ * - check for null terminator
+ * - if no terminator, load bytes to fill rest of register
+ * - compare sides
+ */
+
+strcmp:
+ csrr t1, vlenb /* find vlenb*2 */
+ add t1, t1, t1
+ vsetvli zero, t1, e8, m2, ta, mu
+ vid.v v30
+ addi t2, t1, -1 /* mask for unaligned part of ptr */
+ and t6, a0, t2 /* unaligned part of lhs */
+ and t5, a1, t2 /* unaligned part of rhs */
+ sub t6, t1, t6 /* safe number of lhs bytes to read */
+ sub t5, t1, t5 /* same, rhs */
+ vmsltu.vx v28, v30, t6 /* v28 = mask for first half of lhs load */
+ vmsltu.vx v26, v30, t5 /* v26 = mask for first half of rhs load */
+ vmv.v.x v16, zero
+ vmv.v.x v18, zero
+
+1: vmv.v.v v0, v28 /* lhs mask */
+ vle8.v v2, (a0), v0.t /* masked load from lhs */
+ vmseq.vx v16, v2, zero, v0.t /* check loaded bytes for null */
+ vmv.v.v v0, v26 /* rhs mask */
+ vfirst.m t2, v16 /* get lhs check result */
+ bgez t2, .Ltail /* bail if we can't safely check rest */
+ vle8.v v4, (a1), v0.t /* masked load from rhs */
+ vmseq.vx v18, v4, zero, v0.t /* check partial rhs for null */
+ vmnot.m v0, v28 /* mask for rest of lhs */
+ vfirst.m t3, v18 /* get check result */
+ bltz t3, 2f /* test it */
+ /* we see null terminator */
+ bge t3, t6, .Ltail /* have enough bytes for vector cmp? */
+
+ vmsleu.vx v0, v30, t3 /* select rest + null */
+ vmsne.vv v0, v2, v4, v0.t /* compare */
+ vfirst.m t3, v0
+ bgez t3, 3f
+ mv a0, zero /* no difference */
+ ret
+3: add a0, a0, t3
+ add a1, a1, t3
+ lbu t0, (a0)
+ lbu t1, (a1)
+.Ldiff:
+ sub a0, t0, t1
+ ret
+
+ /* ...no null terminator */
+2: vle8.v v2, (a0), v0.t /* load rest of lhs */
+ vmnot.m v0, v26 /* mask for rest of rhs */
+ vle8.v v4, (a1), v0.t /* load rest of rhs */
+ vmsne.vv v0, v2, v4 /* compare */
+ add a0, a0, t1 /* advance ptrs */
+ vfirst.m t3, v0
+ add a1, a1, t1
+ bltz t3, 1b
+
+ sub t3, t3, t1 /* found difference but we've already advanced a0 and a1 */
+ j 3b
+
+.Ltail:
+ lbu t0, (a0)
+ lbu t1, (a1)
+ bne t0, t1, .Ldiff
+ addi a0, a0, 1
+ addi a1, a1, 1
+ bnez t0, .Ltail
+ mv a0, zero
+ ret
+
+
+.size strcmp, .-strcmp
+libc_hidden_builtin_def (strcmp)
\ No newline at end of file
new file mode 100644
@@ -0,0 +1,72 @@
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+.globl strcpy
+.type strcpy,@function
+
+/*
+ * optimized strcpy for riscv with vector extension
+ * assumptions:
+ * - vlenb is a power of 2
+ * - page size >= 2*vlenb
+ */
+
+.align 2
+strcpy:
+ mv t0, a0 /* copy dest so we can return it */
+
+ csrr t1, vlenb /* find vlenb*2 */
+ add t1, t1, t1
+
+ addi t2, t1, -1 /* mask unaligned part of ptr */
+ and t2, a1, t2
+ beqz t2, .Laligned
+
+ sub t2, t1, t2 /* search enough to align ptr */
+ vsetvli t2, t2, e8, m2, tu, mu
+ vle8.v v2, (a1)
+ vmseq.vx v4, v2, zero
+ vmsif.m v0, v4 /* copy but not past null */
+ vfirst.m t3, v4
+ vse8.v v2, (t0), v0.t
+ bgez t3, .Ldone
+ add t0, t0, t2
+ add a1, a1, t2
+
+.Laligned:
+ vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
+
+1:
+ vle8.v v2, (a1)
+ add a1, a1, t1
+ vmseq.vx v4, v2, zero
+ vmsif.m v0, v4
+ vfirst.m t3, v4
+ vse8.v v2, (t0), v0.t
+ add t0, t0, t1
+ bltz t3, 1b
+
+.Ldone:
+ ret
+
+.size strcpy, .-strcpy
+libc_hidden_builtin_def (strcpy)
\ No newline at end of file
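
A scalar C sketch of the same strategy as the strcpy above, again with CHUNK as an assumed stand-in for vlenb*2; the memchr over a whole chunk mirrors the vector load and relies on the same within-page read-ahead assumption.

#include <stdint.h>
#include <string.h>

#define CHUNK 32   /* assumed stand-in for vlenb*2 */

static char *
strcpy_sketch (char *dst, const char *src)
{
  char *ret = dst;

  /* Head: read only up to the next CHUNK boundary of src so nothing is
     read across a page; copy up to and including any null found.  */
  size_t head = CHUNK - ((uintptr_t) src & (CHUNK - 1));
  for (size_t i = 0; i < head; i++)
    if ((dst[i] = src[i]) == '\0')
      return ret;
  dst += head;
  src += head;

  /* Main loop: src is now CHUNK-aligned, so scanning a whole chunk for
     the terminator stays within one page.  */
  for (;;)
    {
      const char *nul = memchr (src, 0, CHUNK);
      size_t n = nul ? (size_t) (nul - src) + 1 : CHUNK;
      memcpy (dst, src, n);
      if (nul)
        return ret;
      dst += CHUNK;
      src += CHUNK;
    }
}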
new file mode 100644
@@ -0,0 +1,22 @@
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+/* strcspn is implemented in strspn.S
+ */
new file mode 100644
@@ -0,0 +1,67 @@
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+.globl strlen
+.type strlen,@function
+
+/*
+ * optimized strlen for riscv with vector extension
+ * assumptions:
+ * - vlenb is a power of 2
+ * - page size >= 2*vlenb
+ */
+
+.align 2
+strlen:
+ mv t4, a0 /* copy of buffer start */
+ csrr t1, vlenb /* find vlenb*2 */
+ add t1, t1, t1
+ addi t2, t1, -1 /* mask off unaligned part of ptr */
+ and t2, a0, t2
+ beqz t2, .Laligned
+
+ sub t2, t1, t2 /* search fwd to align ptr */
+ vsetvli t2, t2, e8, m2, ta, ma
+ vle8.v v2, (a0)
+ vmseq.vx v0, v2, zero
+ vfirst.m t3, v0
+ bgez t3, .Lfound
+ add a0, a0, t2
+
+.Laligned:
+ vsetvli zero, t1, e8, m2, ta, ma /* search 2*vlenb bytes per pass */
+ add t4, t4, t1
+
+1:
+ vle8.v v2, (a0)
+ add a0, a0, t1
+ vmseq.vx v0, v2, zero
+ vfirst.m t3, v0
+ bltz t3, 1b
+
+.Lfound: /* found the 0; subtract */
+ sub a0, a0, t4 /* buffer start from current ptr */
+ add a0, a0, t3 /* and add offset into fetched */
+ ret /* data to get length */
+
+.size strlen, .-strlen
+libc_hidden_builtin_def (strlen)
\ No newline at end of file
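
The strlen above follows the same pattern; a scalar C sketch, with CHUNK again an assumed stand-in for vlenb*2 and the same read-ahead assumption:

#include <stdint.h>
#include <string.h>

#define CHUNK 32   /* assumed stand-in for vlenb*2 */

static size_t
strlen_sketch (const char *s)
{
  const char *p = s;

  /* Head: scan only up to the next CHUNK boundary so the read cannot
     cross a page.  */
  size_t head = CHUNK - ((uintptr_t) p & (CHUNK - 1));
  const char *nul = memchr (p, 0, head);
  if (nul)
    return (size_t) (nul - s);
  p += head;

  /* Main loop: p is now CHUNK-aligned, so whole chunks are safe to scan.  */
  while ((nul = memchr (p, 0, CHUNK)) == NULL)
    p += CHUNK;
  return (size_t) (nul - s);
}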
new file mode 100644
@@ -0,0 +1,104 @@
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+.globl strncmp
+.type strncmp,@function
+
+.align 2
+
+/* as strcmp, but with added checks on a2 (max count)
+ */
+
+strncmp:
+ csrr t1, vlenb /* find vlenb*2 */
+ add t1, t1, t1
+ blt a2, t1, .Ltail /* degrade if max < vlenb*2 */
+ vsetvli zero, t1, e8, m2, ta, mu
+ vid.v v30
+ addi t2, t1, -1 /* mask unaligned part of ptr */
+ and t6, a0, t2 /* unaligned part of lhs */
+ and t5, a1, t2 /* unaligned part of rhs */
+ sub t6, t1, t6 /* safe count to read from lhs */
+ sub t5, t1, t5 /* same, rhs */
+ vmsltu.vx v28, v30, t6 /* mask for first part of lhs */
+ vmsltu.vx v26, v30, t5 /* mask for first part of rhs */
+ vmv.v.x v16, zero
+ vmv.v.x v18, zero
+
+
+1: blt a2, t1, .Ltail
+ vmv.v.v v0, v28 /* lhs mask */
+ vle8.v v2, (a0), v0.t /* masked load from lhs */
+ vmseq.vx v16, v2, zero, v0.t /* check loaded bytes for null */
+ vmv.v.v v0, v26 /* rhs mask */
+ vfirst.m t2, v16 /* get lhs check result */
+ bgez t2, .Ltail /* can we safely check rest */
+ vle8.v v4, (a1), v0.t /* masked load from rhs */
+ vmseq.vx v18, v4, zero, v0.t /* check partial rhs */
+ vmnot.m v0, v28 /* mask for rest of lhs */
+ vfirst.m t3, v18 /* get check result */
+ bltz t3, 2f /* test it */
+ bge t3, t6, .Ltail
+
+ vmsleu.vx v0, v30, t3 /* select rest of string + null */
+ vmsne.vv v0, v2, v4, v0.t /* compare */
+ vfirst.m t3, v0
+ bgez t3, 3f
+ mv a0, zero
+ ret
+3: add a0, a0, t3
+ add a1, a1, t3
+ lbu t0, (a0)
+ lbu t1, (a1)
+.Ldiff:
+ sub a0, t0, t1
+ ret
+
+ /* ...no null terminator in first part of lhs or rhs */
+2: vle8.v v2, (a0), v0.t /* load rest of lhs */
+ vmnot.m v0, v26 /* mask for rest of rhs */
+ vle8.v v4, (a1), v0.t /* load rest of rhs */
+ vmseq.vx v16, v2, zero /* check the full lhs chunk for a null */
+ vmsne.vv v0, v2, v4 /* compare */
+ add a0, a0, t1 /* advance ptrs */
+ vfirst.m t2, v16 /* first null in the lhs chunk, if any */
+ vfirst.m t3, v0 /* first difference, if any */
+ add a1, a1, t1
+ sub a2, a2, t1
+ bltz t3, 4f /* no difference in this chunk */
+ bltz t2, 5f /* difference and no null: report it */
+ blt t2, t3, 6f /* null strictly before the difference: strings are equal */
+
+5: sub t3, t3, t1 /* found a diff but we've already advanced a0 and a1 */
+ j 3b
+
+4: bltz t2, 1b /* no difference and no null: next chunk */
+6: mv a0, zero /* strings match up to and including the null */
+ ret
+
+.Ltail:
+ beqz a2, 1f
+ addi a2, a2, -1
+ lbu t0, (a0)
+ lbu t1, (a1)
+ bne t0, t1, .Ldiff
+ addi a0, a0, 1
+ addi a1, a1, 1
+ bnez t0, .Ltail
+1: mv a0, zero
+ ret
+
+
+.size strncmp, .-strncmp
+libc_hidden_builtin_def (strncmp)
\ No newline at end of file
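
A scalar C sketch of just what strncmp adds on top of the strcmp strategy: the vector path runs only while a full chunk of the count remains, and the byte-wise tail is bounded by the count. The helper name is illustrative and covers only the tail.

#include <stddef.h>

static int
strncmp_tail_sketch (const unsigned char *a, const unsigned char *b, size_t n)
{
  while (n != 0)
    {
      n--;
      if (*a != *b)
        return *a - *b;
      if (*a == '\0')
        return 0;
      a++;
      b++;
    }
  return 0;   /* count exhausted: equal as far as strncmp is concerned */
}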
new file mode 100644
@@ -0,0 +1,96 @@
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+.globl strncpy
+.type strncpy,@function
+
+/*
+ * optimized strncpy for riscv with vector extension
+ * assumptions:
+ * - vlenb is a power of 2
+ * - page size >= 2*vlenb
+ */
+
+.align 2
+strncpy:
+ mv t0, a0 /* copy dest so we can return it */
+
+ csrr t1, vlenb /* find vlenb*2 */
+ add t1, t1, t1
+
+ addi t2, t1, -1 /* mask off unaligned part of ptr */
+ and t2, a1, t2
+ beqz t2, .Laligned
+
+ sub t2, t1, t2 /* search to align the pointer */
+ vsetvli zero, t2, e8, m2, tu, mu
+ vle8.v v2, (a1)
+ vmseq.vx v4, v2, zero
+ vmsif.m v0, v4 /* select bytes up to and including the null */
+ vfirst.m t3, v4
+ bgeu t2, a2, .Ldest_full
+ vse8.v v2, (t0), v0.t
+ bgez t3, .Lterminator_found
+ add t0, t0, t2
+ add a1, a1, t2
+ sub a2, a2, t2
+ beqz a2, .Ldone
+
+.Laligned:
+ vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
+
+1:
+ vle8.v v2, (a1)
+ add a1, a1, t1
+ vmseq.vx v4, v2, zero
+ vmsif.m v0, v4
+ vfirst.m t3, v4
+ bgeu t1, a2, .Ldest_full
+ vse8.v v2, (t0), v0.t
+ add t0, t0, t1
+ sub a2, a2, t1
+ bltz t3, 1b
+ sub t0, t0, t1 /* undo the advance: t0 is the chunk start again */
+ add a2, a2, t1 /* and restore a2 to the count remaining from that chunk start */
+
+.Lterminator_found:
+ addi sp, sp, -16
+ sd ra, 0(sp)
+ sd a0, 8(sp)
+ add a0, t0, t3
+ mv a1, zero
+ sub a2, a2, t3
+ jal ra, memset
+ ld ra, 0(sp)
+ ld a0, 8(sp)
+ addi sp, sp, 16
+.Ldone:
+ ret
+
+.Ldest_full:
+ vid.v v6
+ vmsltu.vx v4, v6, a2
+ vmand.mm v0, v0, v4
+ vse8.v v2, (t0), v0.t
+ bltz t3, .Ldone /* no null in this chunk: nothing to pad */
+ bgeu t3, a2, .Ldone /* null at or past the count: nothing to pad */
+ j .Lterminator_found /* null inside the count: zero-fill the rest of dest */
+
+.size strncpy, .-strncpy
+libc_hidden_builtin_def (strncpy)
\ No newline at end of file
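
For reference, a scalar C sketch of the semantics the strncpy above has to provide, which is why it hands the remainder to memset: copy at most n bytes and, if the terminator arrives early, zero-fill the rest of dest.

#include <string.h>

static char *
strncpy_sketch (char *dst, const char *src, size_t n)
{
  size_t i = 0;
  while (i < n && src[i] != '\0')
    {
      dst[i] = src[i];
      i++;
    }
  if (i < n)
    memset (dst + i, 0, n - i);   /* zero-fill the remainder of dest */
  return dst;
}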
new file mode 100644
@@ -0,0 +1,81 @@
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+.globl __strnlen
+.type __strnlen,@function
+
+/*
+ * optimized strnlen for riscv with vector extension
+ * assumptions:
+ * - it is safe to read up to, but not past, the end of the page containing
+ *   either the null terminator or the last byte of the count (or both)
+ * - page size >= 2*vlenb
+ */
+
+.align 2
+__strnlen:
+ mv t4, a0 /* stash a copy of start for later */
+ beqz a1, .LzeroCount
+
+ csrr t1, vlenb /* find vlenb*2 */
+ add t1, t1, t1
+ addi t2, t1, -1 /* mask off unaligned part of ptr */
+ and t2, a0, t2
+ beqz t2, .Laligned
+
+ sub t2, t1, t2 /* search to align pointer to t1 */
+ bltu t2, a1, 2f /* aligning read stays within the count: safe to use as-is */
+ mv t2, a1 /* otherwise clamp the first read to the count */
+2: vsetvli t2, t2, e8, m2, ta, ma
+ vle8.v v2, (a0)
+ vmseq.vx v0, v2, zero
+ vfirst.m t3, v0
+ bgez t3, .Lfound
+ add a0, a0, t2
+ sub a1, a1, t2
+ bltu a1, t1, .LreachedCount
+
+.Laligned:
+ vsetvli zero, t1, e8, m2, ta, ma /* do 2*vlenb bytes per pass */
+
+1: vle8.v v2, (a0)
+ sub a1, a1, t1
+ vmseq.vx v0, v2, zero
+ vfirst.m t3, v0
+ bgez t3, .Lfound
+ add a0, a0, t1
+ bgeu a1, t1, 1b
+.LreachedCount:
+ mv t2, a1 /* in case 0 < a1 < t1 */
+ bnez a1, 2b /* if so, still t2 bytes to check, all safe */
+.LzeroCount:
+ sub a0, a0, t4
+ ret
+
+.Lfound: /* found the 0; subtract buffer start from current pointer */
+ add a0, a0, t3 /* and add offset into fetched data */
+ sub a0, a0, t4
+ ret
+
+.size __strnlen, .-__strnlen
+weak_alias (__strnlen, strnlen)
+libc_hidden_builtin_def (__strnlen)
+libc_hidden_builtin_def (strnlen)
\ No newline at end of file
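
A scalar C sketch of the strnlen strategy above, with CHUNK as an assumed stand-in for vlenb*2: clamp the first read so it neither crosses a CHUNK boundary nor exceeds the count, then scan aligned chunks while a full chunk of the count remains, and finish with one short read.

#include <stdint.h>
#include <string.h>

#define CHUNK 32   /* assumed stand-in for vlenb*2 */

static size_t
strnlen_sketch (const char *s, size_t maxlen)
{
  const char *p = s;
  size_t left = maxlen;

  size_t head = CHUNK - ((uintptr_t) p & (CHUNK - 1));
  if (head > left)
    head = left;                       /* stay within the count */
  const char *nul = memchr (p, 0, head);
  if (nul)
    return (size_t) (nul - s);
  p += head;
  left -= head;

  while (left >= CHUNK)                /* aligned full chunks */
    {
      nul = memchr (p, 0, CHUNK);
      if (nul)
        return (size_t) (nul - s);
      p += CHUNK;
      left -= CHUNK;
    }

  if (left && (nul = memchr (p, 0, left)) != NULL)
    return (size_t) (nul - s);
  return maxlen;                       /* no terminator within the count */
}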
new file mode 100644
@@ -0,0 +1,88 @@
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+.globl strrchr
+.type strrchr,@function
+
+/*
+ * optimized strrchr for riscv with vector extension
+ * assumptions:
+ * - vlenb is a power of 2
+ * - page size >= 2*vlenb
+ */
+
+.align 2
+strrchr:
+ mv t5, a0 /* stash buffer ptr somewhere safe */
+ mv a0, zero /* result is nullptr unless we find better below */
+
+ csrr t1, vlenb /* determine vlenb*2 */
+ add t1, t1, t1
+ addi t2, t1, -1 /* mask off unaligned part of ptr */
+ and t2, t5, t2
+ beqz t2, .Laligned
+
+ sub t2, t1, t2 /* search to align ptr to 2*vlenb */
+ vsetvli t2, t2, e8, m2, ta, mu
+
+ vle8.v v2, (t5) /* load data into v2(,v3) */
+ vmseq.vx v4, v2, zero /* check for null terminator */
+ vfirst.m t4, v4 /* grab its position, if any */
+ vmsbf.m v0, v4 /* select valid chars */
+ vmseq.vx v0, v2, a1, v0.t /* search for candidate byte */
+ vfirst.m t3, v0 /* grab its position, if any */
+ bltz t3, 2f /* did we find a candidate? */
+
+3: add a0, t3, t5 /* we did! grab the address */
+ vmsof.m v1, v0 /* there might be more than one */
+ vmandn.mm v0, v0, v1 /* so clear the one we just found */
+ vfirst.m t3, v0 /* is there another? */
+ bgez t3, 3b
+
+2: bgez t4, .Ldone /* did we see a null terminator? */
+ add t5, t5, t2
+
+.Laligned:
+ vsetvli zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
+
+1: vle8.v v2, (t5)
+ vmseq.vx v4, v2, zero
+ vfirst.m t4, v4
+ vmsbf.m v0, v4
+ vmseq.vx v0, v2, a1, v0.t
+ vfirst.m t3, v0
+ bltz t3, 2f
+
+3: add a0, t3, t5
+ vmsof.m v1, v0
+ vmandn.mm v0, v0, v1
+ vfirst.m t3, v0
+ bgez t3, 3b
+
+2: add t5, t5, t1
+ bltz t4, 1b
+
+.Ldone:
+ ret
+
+.size strrchr, .-strrchr
+libc_hidden_builtin_def (strrchr)
\ No newline at end of file
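
A scalar C sketch of the strrchr strategy above (ignoring the unaligned head pass): scan forward, remember the last match seen, and stop once the chunk holding the terminator has been processed. CHUNK is an assumed stand-in for vlenb*2 and the helper name is illustrative.

#include <stddef.h>

#define CHUNK 32   /* assumed stand-in for vlenb*2 */

static char *
strrchr_sketch (const char *s, int c)
{
  const char *last = NULL;

  for (;;)
    {
      /* The assembly bounds each read so it never crosses a page;
         byte-wise access makes the scalar version trivially safe.  */
      for (size_t i = 0; i < CHUNK; i++)
        {
          if (s[i] == (char) c)
            last = s + i;
          if (s[i] == '\0')
            return (char *) last;      /* NULL if c never appeared */
        }
      s += CHUNK;
    }
}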
new file mode 100644
@@ -0,0 +1,189 @@
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+.globl strspn
+.type strspn,@function
+
+.globl strcspn
+.type strcspn,@function
+
+/*
+ * optimized strspn / strcspn for riscv with vector extension
+ * assumptions:
+ * - vlenb is a power of 2
+ * - page size >= 32
+ * strategy:
+ * - build a 256-bit table on the stack, where each elt is zero
+ * if encountering it should terminate computation and nonzero otherwise
+ * - use vectorised lookups into this to check 2*vlen elts at a time;
+ * this code is identical for strspn and strcspn and can be shared
+ *
+ * note that while V mandates at least 128 bit wide regs,
+ * we are building a 256 bit lookup table,
+ * so we use either LMUL=1 or LMUL=2 depending on what the target supports;
+ * we only use even vector register numbers,
+ * so everything still works if we go with LMUL=2
+ */
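
For reference, a scalar C sketch of the 256-bit table described above, as strspn would build it: one bit per possible byte value, set for bytes in the accept set. Bit 0 of byte 0 (the NUL) is never set, so the scan always stops at the string terminator. Helper names are illustrative.

#include <stdint.h>
#include <string.h>

static void
build_accept_table (uint8_t table[32], const char *accept)
{
  /* Bit (b & 7) of table[b >> 3] is set iff byte value b is accepted.  */
  memset (table, 0, 32);
  for (const unsigned char *p = (const unsigned char *) accept; *p; p++)
    table[*p >> 3] |= (uint8_t) (1u << (*p & 7));
}

static int
accepted (const uint8_t table[32], unsigned char b)
{
  return (table[b >> 3] >> (b & 7)) & 1;
}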
+
+# -----------------------------
+
+.align 2
+
+strspn:
+ lbu t0, 0(a1)
+ bnez t0, .Lbuild_table
+ mv a0, zero
+ ret
+
+.Lbuild_table:
+ mv a6, a0 /* store incoming a0 */
+ li t1, 32 /* want to deal with 256 bits at a time, so 32 bytes */
+
+ vsetvli zero, t1, e8, m1, tu, mu
+#if __riscv_v_min_vlen < 256
+ /* we want to build a 256-bit table, so use vlenb*2,
+ * m2 if regs are 128 bits wide or vlenb, m1 if >= 256
+ * 'V' extension specifies a minimum vlen of 128 so this should cover
+ * all cases; we can skip the check if we know vlen >= 256 at compile time
+ */
+ csrr t2, vlenb
+ bgeu t2, t1, 1f
+ vsetvli zero, t1, e8, m2, tu, mu
+1:
+#endif // __riscv_v_min_vlen
+
+ /* read one char from the charset at a time and write the correct bit
+ * in the lookup table; we could do SIMD iff we ever get an extension
+ * that provides some way of scattering bytes into a reg group
+ */
+ vmv.v.x v16, zero /* clear out table */
+ vmv.v.x v8, zero /* clear out v8 */
+ li t3, 1
+ vmv.s.x v8, t3 /* v8 now all zeroes except bottom byte */
+
+1: vmv.v.x v2, zero /* clear out v2 */
+ addi a1, a1, 1 /* advance charset ptr */
+ srli t2, t0, 3 /* divide the byte we read earlier by 8 */
+ vslideup.vx v2, v8, t2 /* v2 now 1 in the correct byte 0 elsewhere */
+ vsll.vx v2, v2, t0 /* v2 now 1 in the correct bit, 0 elsewhere */
+ vor.vv v16, v16, v2 /* or it in */
+ lbu t0, 0(a1) /* fetch next byte */
+ bnez t0, 1b /* loop until we hit the null terminator */
+
+/*
+ * Table is now built in v16.
+ * Strategy:
+ * - fetch next t1 bytes from memory
+ * - vrgather on their values divided by 8 to get relevant bytes of table
+ * - shift right to get the correct bit into bit 1
+ * - and with 1, compare with expected terminator value, then check mask
+ * to see if we've found a terminator
+ *
+ * Before we can begin, a0 needs to be t1-aligned, so that when we fetch
+ * the next t1 bytes - any of which may be the null terminator -
+ * we do not cross a page boundary and read unmapped memory. Therefore
+ * we have one read of however many bytes are needed to align a0,
+ * before the main loop.
+ */
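
A scalar C sketch of the scan loop described above, using a table built as in the earlier sketch: one short pass up to the next 32-byte boundary, then whole 32-byte blocks, returning the index of the first byte whose table bit is clear (at latest, the terminator).

#include <stddef.h>
#include <stdint.h>

static size_t
scan_table_sketch (const char *s, const uint8_t table[32])
{
  const unsigned char *p = (const unsigned char *) s;
  size_t i = 0;
  size_t head = 32 - ((uintptr_t) p & 31);  /* bytes to the next boundary */

  /* The outer loop mirrors the chunking in the assembly; byte-wise access
     is what keeps the scalar version trivially page-safe.  The table
     always has the NUL bit clear, so the inner test fires at the
     terminator at the latest and the loop terminates.  */
  for (size_t end = head; ; end += 32)
    for (; i < end; i++)
      if (((table[p[i] >> 3] >> (p[i] & 7)) & 1) == 0)
        return i;
}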
+
+.Lscan_table:
+ vmv.v.x v8, t3 /* v8 now t1 bytes of 0x01 */
+
+ andi t2, a0, 31 /* offset of a0 within a t1 (=32) byte block */
+ beqz t2, 2f /* or skip if we're already aligned */
+ sub t2, t1, t2 /* t2 now bytes to read to align to t1 */
+
+ vid.v v2 /* build mask instead of changing vl */
+ vmsltu.vx v0, v2, t2 /* so we don't need to track LMUL */
+
+ vle8.v v2, (a0), v0.t /* load next bytes from input */
+ vsrl.vi v4, v2, 3 /* divide by 8 */
+ vrgather.vv v6, v16, v4 /* corresponding bytes of bit table */
+ vsrl.vv v6, v6, v2 /* shift correct bits to lsb */
+ vand.vv v6, v6, v8 /* and with 1 to complete the lookups */
+ vmseq.vx v4, v6, zero, v0.t /* check to see if any 0s are present */
+ vfirst.m t0, v4 /* index of the first 0, if any */
+ bgez t0, .Lscan_end /* if we found one, stop */
+ add a0, a0, t2 /* advance by number of bytes we read */
+
+2: add a6, a6, t1 /* we'll advance a0 before the exit check */
+1: vle8.v v2, (a0) /* as above but unmasked so t1 elts per pass */
+ add a0, a0, t1
+
+ vsrl.vi v4, v2, 3
+ vrgather.vv v6, v16, v4
+ vsrl.vv v6, v6, v2
+ vand.vv v6, v6, v8
+
+ vmseq.vx v4, v6, zero
+ vfirst.m t0, v4
+ bltz t0, 1b
+
+.Lscan_end:
+ add a0, a0, t0 /* calculate offset to terminating byte */
+ sub a0, a0, a6
+ ret
+.size strspn, .-strspn
+
+/* strcspn
+ *
+ * table build exactly as for strspn, except:
+ * - the lookup table starts with all bits except bit 0 of byte 0 set
+ * - we clear the corresponding bit for each byte in the charset
+ * once table is built, we can reuse the scan code directly
+ */
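
A scalar C sketch of the inverted table build described above (helper name illustrative):

#include <stdint.h>
#include <string.h>

static void
build_reject_table (uint8_t table[32], const char *reject)
{
  /* Start with every bit set except bit 0 of byte 0, so the scan still
     stops at the terminator, then clear the bit for each reject byte.  */
  memset (table, 0xff, 32);
  table[0] &= (uint8_t) ~1u;
  for (const unsigned char *p = (const unsigned char *) reject; *p; p++)
    table[*p >> 3] &= (uint8_t) ~(1u << (*p & 7));
}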
+
+strcspn:
+ lbu t0, 0(a1)
+ beqz t0, strlen /* no rejections -> prefix is whole string */
+
+ mv a6, a0
+ li t1, 32
+
+ vsetvli zero, t1, e8, m1, tu, mu
+#if __riscv_v_min_vlen < 256
+ csrr t2, vlenb
+ bgeu t2, t1, 1f
+ vsetvli zero, t1, e8, m2, tu, mu
+1:
+#endif // __riscv_v_min_vlen
+
+ vmv.v.x v8, zero
+ li t3, 1 /* all bits clear except bit 0 of byte 0 */
+ vmv.s.x v8, t3
+ vnot.v v16, v8 /* v16 is the inverse of that */
+ li t4, -1
+
+1: vmv.v.x v2, zero
+ addi a1, a1, 1 /* advance charset ptr */
+ srli t2, t0, 3 /* select correct bit in v2 */
+ vslideup.vx v2, v8, t2
+ vsll.vx v2, v2, t0
+ vnot.v v2, v2 /* invert */
+ vand.vv v16, v16, v2 /* clear the relevant bit of table */
+ lbu t0, 0(a1)
+ bnez t0, 1b
+ j .Lscan_table
+.size strcspn, .-strcspn
+
+libc_hidden_builtin_def (strspn)
+libc_hidden_builtin_def (strcspn)
\ No newline at end of file