[2/2] riscv: vectorised mem* and str* functions

Message ID 20230201095232.15942-2-slewis@rivosinc.com
State New
Series [1/2] riscv: sysdeps support for vectorised functions

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Sergei Lewis Feb. 1, 2023, 9:52 a.m. UTC
  Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
targeting the riscv "V" extension, version 1.0

The vectorised implementations assume VLENB of at least 128 and at least 32
registers (as mandated by the "V" extension spec). They also assume that
VLENB is a power of two which is no larger than the page size, and (as
vectorised code in glibc for other platforms does) that it is safe to read
past null terminators / buffer ends provided one does not cross a page
boundary.

Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
---
 sysdeps/riscv/rv64/rvv/Implies     |   2 +
 sysdeps/riscv/rv64/rvv/memchr.S    | 127 +++++++++++++++++++
 sysdeps/riscv/rv64/rvv/memcmp.S    |  93 ++++++++++++++
 sysdeps/riscv/rv64/rvv/memcpy.S    | 154 +++++++++++++++++++++++
 sysdeps/riscv/rv64/rvv/memmove.c   |  22 ++++
 sysdeps/riscv/rv64/rvv/memset.S    |  89 ++++++++++++++
 sysdeps/riscv/rv64/rvv/strchr.S    |  92 ++++++++++++++
 sysdeps/riscv/rv64/rvv/strchrnul.c |  22 ++++
 sysdeps/riscv/rv64/rvv/strcmp.S    | 108 +++++++++++++++++
 sysdeps/riscv/rv64/rvv/strcpy.S    |  72 +++++++++++
 sysdeps/riscv/rv64/rvv/strcspn.c   |  22 ++++
 sysdeps/riscv/rv64/rvv/strlen.S    |  67 ++++++++++
 sysdeps/riscv/rv64/rvv/strncmp.S   | 104 ++++++++++++++++
 sysdeps/riscv/rv64/rvv/strncpy.S   |  96 +++++++++++++++
 sysdeps/riscv/rv64/rvv/strnlen.S   |  81 +++++++++++++
 sysdeps/riscv/rv64/rvv/strrchr.S   |  88 ++++++++++++++
 sysdeps/riscv/rv64/rvv/strspn.S    | 189 +++++++++++++++++++++++++++++
 17 files changed, 1428 insertions(+)
 create mode 100644 sysdeps/riscv/rv64/rvv/Implies
 create mode 100644 sysdeps/riscv/rv64/rvv/memchr.S
 create mode 100644 sysdeps/riscv/rv64/rvv/memcmp.S
 create mode 100644 sysdeps/riscv/rv64/rvv/memcpy.S
 create mode 100644 sysdeps/riscv/rv64/rvv/memmove.c
 create mode 100644 sysdeps/riscv/rv64/rvv/memset.S
 create mode 100644 sysdeps/riscv/rv64/rvv/strchr.S
 create mode 100644 sysdeps/riscv/rv64/rvv/strchrnul.c
 create mode 100644 sysdeps/riscv/rv64/rvv/strcmp.S
 create mode 100644 sysdeps/riscv/rv64/rvv/strcpy.S
 create mode 100644 sysdeps/riscv/rv64/rvv/strcspn.c
 create mode 100644 sysdeps/riscv/rv64/rvv/strlen.S
 create mode 100644 sysdeps/riscv/rv64/rvv/strncmp.S
 create mode 100644 sysdeps/riscv/rv64/rvv/strncpy.S
 create mode 100644 sysdeps/riscv/rv64/rvv/strnlen.S
 create mode 100644 sysdeps/riscv/rv64/rvv/strrchr.S
 create mode 100644 sysdeps/riscv/rv64/rvv/strspn.S
  

Comments

Jeff Law Feb. 1, 2023, 3:33 p.m. UTC | #1
On 2/1/23 02:52, Sergei Lewis wrote:
> Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
> strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
> targeting the riscv "V" extension, version 1.0
> 
> The vectorised implementations assume VLENB of at least 128 and at least 32
> registers (as mandated by the "V" extension spec). They also assume that
> VLENB is a power of two which is no larger than the page size, and (as
> vectorised code in glibc for other platforms does) that it is safe to read
> past null terminators / buffer ends provided one does not cross a page
> boundary.
> 
> Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
> ---
>   sysdeps/riscv/rv64/rvv/Implies     |   2 +
>   sysdeps/riscv/rv64/rvv/memchr.S    | 127 +++++++++++++++++++
>   sysdeps/riscv/rv64/rvv/memcmp.S    |  93 ++++++++++++++
>   sysdeps/riscv/rv64/rvv/memcpy.S    | 154 +++++++++++++++++++++++
>   sysdeps/riscv/rv64/rvv/memmove.c   |  22 ++++
>   sysdeps/riscv/rv64/rvv/memset.S    |  89 ++++++++++++++
>   sysdeps/riscv/rv64/rvv/strchr.S    |  92 ++++++++++++++
>   sysdeps/riscv/rv64/rvv/strchrnul.c |  22 ++++
>   sysdeps/riscv/rv64/rvv/strcmp.S    | 108 +++++++++++++++++
>   sysdeps/riscv/rv64/rvv/strcpy.S    |  72 +++++++++++
>   sysdeps/riscv/rv64/rvv/strcspn.c   |  22 ++++
>   sysdeps/riscv/rv64/rvv/strlen.S    |  67 ++++++++++
>   sysdeps/riscv/rv64/rvv/strncmp.S   | 104 ++++++++++++++++
>   sysdeps/riscv/rv64/rvv/strncpy.S   |  96 +++++++++++++++
>   sysdeps/riscv/rv64/rvv/strnlen.S   |  81 +++++++++++++
>   sysdeps/riscv/rv64/rvv/strrchr.S   |  88 ++++++++++++++
>   sysdeps/riscv/rv64/rvv/strspn.S    | 189 +++++++++++++++++++++++++++++
Does this need to be revamped given the recent push to do more with 
generic code and target specific hooks for mem* and str*?

Shouldn't the implementations be in a multiarch directory?  I would 
fully expect we're going to need both a vector and scalar implementation 
selected by an ifunc.
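
Roughly, the selection would look like the sketch below (illustrative only: __memcpy_vector, __memcpy_scalar and cpu_has_rvv are placeholder names, not existing glibc symbols, and glibc's multiarch framework wraps this in its own macros rather than using the raw attribute):

/* Sketch of IFUNC-based dispatch between a scalar and a vector memcpy.
   All names here are placeholders.  */
#include <stddef.h>

extern void *__memcpy_vector (void *, const void *, size_t);
extern void *__memcpy_scalar (void *, const void *, size_t);
extern int cpu_has_rvv (void);     /* placeholder runtime capability check */

/* The resolver runs once, at relocation time, and returns the address of
   the implementation the dynamic linker should bind memcpy to.  */
static void *(*memcpy_resolver (void)) (void *, const void *, size_t)
{
  return cpu_has_rvv () ? __memcpy_vector : __memcpy_scalar;
}

void *memcpy (void *, const void *, size_t)
     __attribute__ ((ifunc ("memcpy_resolver")));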

I'm happy to pass along the current bits from VRULL which put that 
infrastructure in place.  I just haven't had the time to look at 
revamping their assembly implementations for the new generic+hooks scheme.

jeff
  
Florian Weimer Feb. 1, 2023, 4:42 p.m. UTC | #2
* Jeff Law via Libc-alpha:

> On 2/1/23 02:52, Sergei Lewis wrote:
>> Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
>> strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
>> targeting the riscv "V" extension, version 1.0
>> The vectorised implementations assume VLENB of at least 128 and at
>> least 32
>> registers (as mandated by the "V" extension spec). They also assume that
>> VLENB is a power of two which is no larger than the page size, and (as
>> vectorised code in glibc for other platforms does) that it is safe to read
>> past null terminators / buffer ends provided one does not cross a page
>> boundary.
>> Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
>> ---
>>   sysdeps/riscv/rv64/rvv/Implies     |   2 +
>>   sysdeps/riscv/rv64/rvv/memchr.S    | 127 +++++++++++++++++++
>>   sysdeps/riscv/rv64/rvv/memcmp.S    |  93 ++++++++++++++
>>   sysdeps/riscv/rv64/rvv/memcpy.S    | 154 +++++++++++++++++++++++
>>   sysdeps/riscv/rv64/rvv/memmove.c   |  22 ++++
>>   sysdeps/riscv/rv64/rvv/memset.S    |  89 ++++++++++++++
>>   sysdeps/riscv/rv64/rvv/strchr.S    |  92 ++++++++++++++
>>   sysdeps/riscv/rv64/rvv/strchrnul.c |  22 ++++
>>   sysdeps/riscv/rv64/rvv/strcmp.S    | 108 +++++++++++++++++
>>   sysdeps/riscv/rv64/rvv/strcpy.S    |  72 +++++++++++
>>   sysdeps/riscv/rv64/rvv/strcspn.c   |  22 ++++
>>   sysdeps/riscv/rv64/rvv/strlen.S    |  67 ++++++++++
>>   sysdeps/riscv/rv64/rvv/strncmp.S   | 104 ++++++++++++++++
>>   sysdeps/riscv/rv64/rvv/strncpy.S   |  96 +++++++++++++++
>>   sysdeps/riscv/rv64/rvv/strnlen.S   |  81 +++++++++++++
>>   sysdeps/riscv/rv64/rvv/strrchr.S   |  88 ++++++++++++++
>>   sysdeps/riscv/rv64/rvv/strspn.S    | 189 +++++++++++++++++++++++++++++
> Does this need to be revamped given the recent push to do more with
> generic code and target specific hooks for mem* and str*?
>
> Shouldn't the implementations be in a multiarch directory?  I would
> fully expect we're going to need both a vector and scalar
> implementation selected by an ifunc.

I think most RISC-V GCC compilers won't have enabled IFUNC support?
Looking at gcc/config.gcc in GCC 12, I see this:

*-*-linux* | *-*-gnu*)
        case ${target} in
        aarch64*-* | arm*-* | i[34567]86-* | powerpc*-* | s390*-* | sparc*-* | x86_64-* | loongarch*-*)
                default_gnu_indirect_function=yes
                ;;
        esac

But maybe that's not the right place to look at?

We have an assembler hack to be able to still build IFUNC resolvers
written in C, but I don't know if this works on RISC-V.
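
For reference, that hack is roughly of this shape (a sketch, not the actual libc_ifunc macro, and whether the directives behave on RISC-V is exactly the open question):

/* Sketch of the assembler-directive fallback: the resolver is an ordinary
   C function whose assembler name is the public symbol, and the symbol is
   then retyped as an indirect function.  Placeholder names throughout.  */
#include <stddef.h>

extern void *__memcpy_vector (void *, const void *, size_t);
extern void *__memcpy_scalar (void *, const void *, size_t);
extern int cpu_has_rvv (void);

void *memcpy_ifunc (void) __asm__ ("memcpy");
void *memcpy_ifunc (void)
{
  return cpu_has_rvv () ? (void *) __memcpy_vector : (void *) __memcpy_scalar;
}

__asm__ (".type memcpy, %gnu_indirect_function");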

Ideally the GCC defaults would change, too, and well before IFUNCs are
in common use.

Thanks,
Florian
  
Jeff Law Feb. 1, 2023, 5:07 p.m. UTC | #3
On 2/1/23 09:42, Florian Weimer wrote:
> * Jeff Law via Libc-alpha:
> 
>> On 2/1/23 02:52, Sergei Lewis wrote:
>>> Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
>>> strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
>>> targeting the riscv "V" extension, version 1.0
>>> The vectorised implementations assume VLENB of at least 128 and at
>>> least 32
>>> registers (as mandated by the "V" extension spec). They also assume that
>>> VLENB is a power of two which is no larger than the page size, and (as
>>> vectorised code in glibc for other platforms does) that it is safe to read
>>> past null terminators / buffer ends provided one does not cross a page
>>> boundary.
>>> Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
>>> ---
>>>    sysdeps/riscv/rv64/rvv/Implies     |   2 +
>>>    sysdeps/riscv/rv64/rvv/memchr.S    | 127 +++++++++++++++++++
>>>    sysdeps/riscv/rv64/rvv/memcmp.S    |  93 ++++++++++++++
>>>    sysdeps/riscv/rv64/rvv/memcpy.S    | 154 +++++++++++++++++++++++
>>>    sysdeps/riscv/rv64/rvv/memmove.c   |  22 ++++
>>>    sysdeps/riscv/rv64/rvv/memset.S    |  89 ++++++++++++++
>>>    sysdeps/riscv/rv64/rvv/strchr.S    |  92 ++++++++++++++
>>>    sysdeps/riscv/rv64/rvv/strchrnul.c |  22 ++++
>>>    sysdeps/riscv/rv64/rvv/strcmp.S    | 108 +++++++++++++++++
>>>    sysdeps/riscv/rv64/rvv/strcpy.S    |  72 +++++++++++
>>>    sysdeps/riscv/rv64/rvv/strcspn.c   |  22 ++++
>>>    sysdeps/riscv/rv64/rvv/strlen.S    |  67 ++++++++++
>>>    sysdeps/riscv/rv64/rvv/strncmp.S   | 104 ++++++++++++++++
>>>    sysdeps/riscv/rv64/rvv/strncpy.S   |  96 +++++++++++++++
>>>    sysdeps/riscv/rv64/rvv/strnlen.S   |  81 +++++++++++++
>>>    sysdeps/riscv/rv64/rvv/strrchr.S   |  88 ++++++++++++++
>>>    sysdeps/riscv/rv64/rvv/strspn.S    | 189 +++++++++++++++++++++++++++++
>> Does this need to be revamped given the recent push to do more with
>> generic code and target specific hooks for mem* and str*?
>>
>> Shouldn't the implementations be in a multiarch directory?  I would
>> fully expect we're going to need both a vector and scalar
>> implementation selected by an ifunc.
> 
> I think most RISC-V GCC compilers won't have enabled IFUNC support?
> Looking at gcc/config.gcc in GCC 12, I see this:
> 
> *-*-linux* | *-*-gnu*)
>          case ${target} in
>          aarch64*-* | arm*-* | i[34567]86-* | powerpc*-* | s390*-* | sparc*-* | x86_64-* | loongarch*-*)
>                  default_gnu_indirect_function=yes
>                  ;;
>          esac
> 
> But maybe that's not the right place to look at?
Clearly something we need to fix.

I'd hesitate to turn on the gcc bits without having the kernel/user 
interface settled.  There was a proposal that added a syscall to get the 
processor capabilities -- I'd asked the authors to reach out to you and 
Carlos on whether or not that was acceptable for glibc.  I'm not sure if 
that happened or not.

> 
> We have an assembler hack to be able to still build IFUNC resolvers
> written in C, but I don't know if this works on RISC-V.
It probably doesn't yet.

> 
> Ideally the GCC defaults would change, too, and well before IFUNCs are
> in common use.
They're not common, but I suspect that'll change in the next ~6 months.

Jeff
  
Adhemerval Zanella Netto Feb. 1, 2023, 5:17 p.m. UTC | #4
On 01/02/23 12:33, Jeff Law via Libc-alpha wrote:
> 
> 
> On 2/1/23 02:52, Sergei Lewis wrote:
>> Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
>> strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
>> targeting the riscv "V" extension, version 1.0
>>
>> The vectorised implementations assume VLENB of at least 128 and at least 32
>> registers (as mandated by the "V" extension spec). They also assume that
>> VLENB is a power of two which is no larger than the page size, and (as
>> vectorised code in glibc for other platforms does) that it is safe to read
>> past null terminators / buffer ends provided one does not cross a page
>> boundary.
>>
>> Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
>> ---
>>   sysdeps/riscv/rv64/rvv/Implies     |   2 +
>>   sysdeps/riscv/rv64/rvv/memchr.S    | 127 +++++++++++++++++++
>>   sysdeps/riscv/rv64/rvv/memcmp.S    |  93 ++++++++++++++
>>   sysdeps/riscv/rv64/rvv/memcpy.S    | 154 +++++++++++++++++++++++
>>   sysdeps/riscv/rv64/rvv/memmove.c   |  22 ++++
>>   sysdeps/riscv/rv64/rvv/memset.S    |  89 ++++++++++++++
>>   sysdeps/riscv/rv64/rvv/strchr.S    |  92 ++++++++++++++
>>   sysdeps/riscv/rv64/rvv/strchrnul.c |  22 ++++
>>   sysdeps/riscv/rv64/rvv/strcmp.S    | 108 +++++++++++++++++
>>   sysdeps/riscv/rv64/rvv/strcpy.S    |  72 +++++++++++
>>   sysdeps/riscv/rv64/rvv/strcspn.c   |  22 ++++
>>   sysdeps/riscv/rv64/rvv/strlen.S    |  67 ++++++++++
>>   sysdeps/riscv/rv64/rvv/strncmp.S   | 104 ++++++++++++++++
>>   sysdeps/riscv/rv64/rvv/strncpy.S   |  96 +++++++++++++++
>>   sysdeps/riscv/rv64/rvv/strnlen.S   |  81 +++++++++++++
>>   sysdeps/riscv/rv64/rvv/strrchr.S   |  88 ++++++++++++++
>>   sysdeps/riscv/rv64/rvv/strspn.S    | 189 +++++++++++++++++++++++++++++
> Does this need to be revamped given the recent push to do more with generic code and target specific hooks for mem* and str*?
> 

It should be doable, although I think it might require some more hooks, since
afaiu the RISC-V vector instructions do not have the concept of an address update
as part of the vector operation (as my generic string routines do by using the
'vector' type op_t).  So I am not sure it is a good fit for the RISC-V vector
approach.
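
For comparison, the generic routines walk the string in op_t-sized chunks with explicit pointer updates, roughly like this simplified sketch (has_zero stands in for the per-arch string-fz* hooks; alignment and endian handling are omitted):

/* Simplified sketch of the generic op_t-based pattern: load a word, test
   it with a hook, and advance the address explicitly.  Not the actual
   glibc code.  */
#include <stddef.h>
#include <stdint.h>

typedef uintptr_t op_t;

static inline int
has_zero (op_t x)
{
  /* A byte of X is zero iff its 0x80 bit survives this expression.  */
  op_t lsb = (op_t) 0x0101010101010101ULL;
  op_t msb = (op_t) 0x8080808080808080ULL;
  return ((x - lsb) & ~x & msb) != 0;
}

size_t
sketch_strlen (const char *s)
{
  const op_t *w = (const op_t *) s;   /* assume suitable alignment */
  while (!has_zero (*w))
    w++;                              /* explicit address update each step */
  const char *p = (const char *) w;   /* locate the zero byte in the word */
  while (*p != '\0')
    p++;
  return (size_t) (p - s);
}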


> Shouldn't the implementations be in a multiarch directory?  I would fully expect we're going to need both a vector and scalar implementation selected by an ifunc.

If so, we will need both IFUNC support on riscv (which, as Florian pointed out,
is not the default) and a way to discover the processor/kernel capabilities at
runtime.  It does not seem that RISC-V has the expected hwcap support; the UAPI
only defines COMPAT_HWCAP_ISA_*, which also does not seem to cover the vector or
bitmanip extensions.  Does it have an instruction to query such information,
something like cpuid (x86) or midr_elX (aarch64)?

It would be interesting to add an ifunc variant for bitmanip as well.

> 
> I'm happy to pass along the current bits from VRULL which put that infrastructure in place.  I just haven't had the time to look at revamping their assembly implementations for the new generic+hooks scheme.

I just sent an updated version [1] where I added bitmanip optimization [2].
So now with a recent gcc (I tested with gcc 13 with upstream qemu), the
string routines should use ctz/clz/orc.b.  I did not add support for
xthread [3], but it should be doable.

Once I get this upstream, I have a WIP to revamp the memmove/memcpy/memset/memcmp
as well.  At least for memcmp it should use bitmanip as well.
 
[1] https://patchwork.sourceware.org/project/glibc/list/?series=16622
[2] https://patchwork.sourceware.org/project/glibc/patch/20230201170406.303978-21-adhemerval.zanella@linaro.org/
[3] https://lore.kernel.org/all/20220906122243.1243354-1-christoph.muellner@vrull.eu/
  
Adhemerval Zanella Netto Feb. 1, 2023, 5:38 p.m. UTC | #5
On 01/02/23 06:52, Sergei Lewis wrote:
> Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
> strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
> targeting the riscv "V" extension, version 1.0
> 
> The vectorised implementations assume VLENB of at least 128 and at least 32
> registers (as mandated by the "V" extension spec). They also assume that
> VLENB is a power of two which is no larger than the page size, and (as
> vectorised code in glibc for other platforms does) that it is safe to read
> past null terminators / buffer ends provided one does not cross a page
> boundary.
> 
> Signed-off-by: Sergei Lewis <slewis@rivosinc.com>

Some comments that might be useful, since I am working on the generic
implementations below.

Also, I think this should be split into one implementation per patch, unless the
implementations are tied together (as for strchr/strchrnul, for instance).  Do
the vectorised routines only work for rv64?

> ---
>  sysdeps/riscv/rv64/rvv/Implies     |   2 +
>  sysdeps/riscv/rv64/rvv/memchr.S    | 127 +++++++++++++++++++
>  sysdeps/riscv/rv64/rvv/memcmp.S    |  93 ++++++++++++++
>  sysdeps/riscv/rv64/rvv/memcpy.S    | 154 +++++++++++++++++++++++
>  sysdeps/riscv/rv64/rvv/memmove.c   |  22 ++++
>  sysdeps/riscv/rv64/rvv/memset.S    |  89 ++++++++++++++
>  sysdeps/riscv/rv64/rvv/strchr.S    |  92 ++++++++++++++
>  sysdeps/riscv/rv64/rvv/strchrnul.c |  22 ++++
>  sysdeps/riscv/rv64/rvv/strcmp.S    | 108 +++++++++++++++++
>  sysdeps/riscv/rv64/rvv/strcpy.S    |  72 +++++++++++
>  sysdeps/riscv/rv64/rvv/strcspn.c   |  22 ++++
>  sysdeps/riscv/rv64/rvv/strlen.S    |  67 ++++++++++
>  sysdeps/riscv/rv64/rvv/strncmp.S   | 104 ++++++++++++++++
>  sysdeps/riscv/rv64/rvv/strncpy.S   |  96 +++++++++++++++
>  sysdeps/riscv/rv64/rvv/strnlen.S   |  81 +++++++++++++
>  sysdeps/riscv/rv64/rvv/strrchr.S   |  88 ++++++++++++++
>  sysdeps/riscv/rv64/rvv/strspn.S    | 189 +++++++++++++++++++++++++++++
>  17 files changed, 1428 insertions(+)
>  create mode 100644 sysdeps/riscv/rv64/rvv/Implies
>  create mode 100644 sysdeps/riscv/rv64/rvv/memchr.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/memcmp.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/memcpy.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/memmove.c
>  create mode 100644 sysdeps/riscv/rv64/rvv/memset.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strchr.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strchrnul.c
>  create mode 100644 sysdeps/riscv/rv64/rvv/strcmp.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strcpy.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strcspn.c
>  create mode 100644 sysdeps/riscv/rv64/rvv/strlen.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strncmp.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strncpy.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strnlen.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strrchr.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strspn.S
> 
> diff --git a/sysdeps/riscv/rv64/rvv/Implies b/sysdeps/riscv/rv64/rvv/Implies
> new file mode 100644
> index 0000000000..b07b4cb906
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/Implies
> @@ -0,0 +1,2 @@
> +riscv/rv64/rvd
> +
> diff --git a/sysdeps/riscv/rv64/rvv/memchr.S b/sysdeps/riscv/rv64/rvv/memchr.S
> new file mode 100644
> index 0000000000..a7e32b8f25
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memchr.S
> @@ -0,0 +1,127 @@
> +

Spurious newline at the start.  We also require a brief comment describing
the file contents for new files.

> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.

Not sure 2012 range fits here.

> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +
> +/* Optimised memchr for riscv with vector extension
> + * Assumptions:
> + *    - cpu becomes bandwidth limited at or before
> + *            2 vector register sized read/write operations
> + *          + 2 scalar operations
> + *          + conditional branch
> + */
> +
> +.globl  memchr
> +.type   memchr,@function
> +
> +.align    2
> +memchr:

We have the ENTRY macro for that.

> +    beqz a2, .Lnot_found

Maybe use the L macro here for local labels.

> +    csrr    t1, vlenb
> +    bgeu    a2, t1, .Lvector_path   /* only use vector path if we're scanning
> +                                       at least vlenb bytes */
> +
> +#ifndef __riscv_strict_align

Would this be defined by the compiler as a predefined macro, or is it just a debug
switch?  If the latter, I think it would be better to remove it.

> +    li a3, 8
> +    blt a2, a3, .Lbytewise
> +
> +    li      t1, 0x0101010101010101
> +    slli    a4, t1, 7     /* a4 = 0x8080808080808080 */
> +    mul     t2, a1, t1    /* entirety of t2 is now repeats of target character;
> +                             assume mul is at worst no worse than 3*(shift+OR),
> +                             otherwise do that instead */
> +
> +/*
> + * strategy:
> + * t4 = ((*a0) ^ t2)
> + *      - now t4 contains zero bytes if and only if next word of memory
> + *        had target character at those positions
> + *
> + * t4 = ((t4-0x0101010101010101) & ~t4) & 0x8080808080808080
> + *      - all nonzero bytes of t4 become 0; zero bytes become 0x80
> + *
> + * if t4 is nonzero, find the index of the byte within it, add to a0 and return
> + * otherwise, loop
> + */
> +
> +1:
> +    ld t4, (a0)          /* t4 = load next 8 bytes */
> +    xor t4, t4, t2
> +    sub t5, t4, t1
> +    not t4, t4
> +    and t4, t5, t4
> +    and t4, t4, a4
> +    bnez t4, .Lbytewise  /* could use ctzw, mod+lookup or just binary chop
> +                            to locate byte of interest in t4 but profiling
> +                            shows these approaches are at best no better */
> +    addi a2, a2, -8
> +    addi a0, a0, 8
> +    bgeu a2, a3, 1b
> +    beqz a2, .Lnot_found
> +#endif // __riscv_strict_align
> +
> +/* too little data for a dword. mask calculation and branch mispredict costs
> +   make checking a word not worthwhile. degrade to bytewise search. */
> +
> +.Lbytewise:
> +    add t2, a0, a2
> +
> +1:
> +    lb t1, (a0)
> +    beq t1, a1, .Lfound
> +    addi a0, a0, 1
> +    blt a0, t2, 1b
> +
> +.Lnot_found:
> +    mv a0, zero
> +.Lfound:
> +    ret
> +
> +.Lvector_path:
> +    vsetvli t2, a2, e8, m2, ta, ma
> +
> +1:
> +    vle8.v      v2, (a0)
> +    vmseq.vx    v0, v2, a1
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lvec_found
> +    add         a0, a0, t2
> +    sub         a2, a2, t2
> +    bge         a2, t2, 1b
> +    bnez        a2, 2f
> +    mv          a0, zero
> +    ret
> +
> +2:
> +    vsetvli t2, a2, e8, m2, ta, ma
> +    vle8.v      v2, (a0)
> +    vmseq.vx    v0, v2, a1
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lvec_found
> +    mv          a0, zero
> +    ret
> +
> +.Lvec_found:
> +    add a0, a0, t3
> +    ret
> +
> +.size   memchr, .-memchr
> +libc_hidden_builtin_def (memchr)
> \ No newline at end of file

Please add a newline.

> diff --git a/sysdeps/riscv/rv64/rvv/strcpy.S b/sysdeps/riscv/rv64/rvv/strcpy.S
> new file mode 100644
> index 0000000000..b21909d66f
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcpy.S
> @@ -0,0 +1,72 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +

You could add an optimised stpcpy and implement strcpy on top of that
(as my generic proposal does [1]).  ARMv6 does something similar [2].

[1] https://patchwork.sourceware.org/project/glibc/patch/20230201170406.303978-12-adhemerval.zanella@linaro.org/
[2] https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/arm/armv6/strcpy.S;h=e9f63a56c1c605a21b05f7ac21412585b0705171;hb=HEAD
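
That layering is essentially the sketch below (illustrative only; __stpcpy is assumed to be the optimised routine):

/* Sketch of strcpy implemented on top of an optimised stpcpy.  */
extern char *__stpcpy (char *dest, const char *src);

char *
strcpy (char *dest, const char *src)
{
  __stpcpy (dest, src);   /* copies the string including its terminator */
  return dest;            /* strcpy returns the destination pointer */
}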

> +#include <sysdep.h>
> +
> +.globl  strcpy
> +.type   strcpy,@function
> +
> +/*
> + *  optimized strcpy for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 2*vlenb
> + */
> +
> +.align    2
> +strcpy:
> +    mv          t0, a0                     /* copy dest so we can return it */
> +
> +    csrr        t1, vlenb                  /* find vlenb*2 */
> +    add         t1, t1, t1
> +
> +    addi        t2, t1, -1                 /* mask unaligned part of ptr */
> +    and         t2, a1, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2                 /* search enough to align ptr */
> +    vsetvli     t2, t2, e8, m2, tu, mu
> +    vle8.v      v2, (a1)
> +    vmseq.vx    v4, v2, zero
> +    vmsif.m     v0, v4                     /* copy but not past null */
> +    vfirst.m    t3, v4
> +    vse8.v      v2, (t0), v0.t
> +    bgez        t3, .Ldone
> +    add         t0, t0, t2
> +    add         a1, a1, t2
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, mu   /* now do 2*vlenb bytes per pass */
> +
> +1:
> +    vle8.v      v2, (a1)
> +    add         a1, a1, t1
> +    vmseq.vx    v4, v2, zero
> +    vmsif.m     v0, v4
> +    vfirst.m    t3, v4
> +    vse8.v      v2, (t0), v0.t
> +    add         t0, t0, t1
> +    bltz        t3, 1b
> +
> +.Ldone:
> +    ret
> +
> +.size   strcpy, .-strcpy
> +libc_hidden_builtin_def (strcpy)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strcspn.c b/sysdeps/riscv/rv64/rvv/strcspn.c
> new file mode 100644
> index 0000000000..f0595a72fb
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcspn.c
> @@ -0,0 +1,22 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +/* strcspn is implemented in strspn.S
> + */
> diff --git a/sysdeps/riscv/rv64/rvv/strlen.S b/sysdeps/riscv/rv64/rvv/strlen.S
> new file mode 100644
> index 0000000000..c77d500693
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strlen.S
> @@ -0,0 +1,67 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strlen
> +.type   strlen,@function
> +
> +/*
> + *  optimized strlen for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 2*vlenb
> + */
> +
> +.align    2
> +strlen:
> +    mv          t4, a0                    /* copy of buffer start */
> +    csrr        t1, vlenb                 /* find vlenb*2 */
> +    add         t1, t1, t1
> +    addi        t2, t1, -1                /* mask off unaligned part of ptr */
> +    and         t2, a0, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2                /* search fwd to align ptr */
> +    vsetvli     t2, t2, e8, m2, ta, ma
> +    vle8.v      v2, (a0)
> +    vmseq.vx    v0, v2, zero
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lfound
> +    add         a0, a0, t2
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, ma  /* search 2*vlenb bytes per pass */
> +    add         t4, t4, t1
> +
> +1:
> +    vle8.v      v2, (a0)
> +    add         a0, a0, t1
> +    vmseq.vx    v0, v2, zero
> +    vfirst.m    t3, v0
> +    bltz        t3, 1b
> +
> +.Lfound:                                  /* found the 0; subtract          */
> +    sub         a0, a0, t4                /* buffer start from current ptr  */
> +    add         a0, a0, t3                /* and add offset into fetched    */
> +    ret                                   /* data to get length */
> +
> +.size   strlen, .-strlen
> +libc_hidden_builtin_def (strlen)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strncmp.S b/sysdeps/riscv/rv64/rvv/strncmp.S
> new file mode 100644
> index 0000000000..863e5cb525
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strncmp.S
> @@ -0,0 +1,104 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strncmp
> +.type   strncmp,@function
> +
> +.align    2
> +
> +/* as strcmp, but with added checks on a2 (max count)
> + */
> +
> +strncmp:
> +    csrr        t1, vlenb                   /* find vlenb*2 */
> +    add         t1, t1, t1
> +    blt         a2, t1, .Ltail              /* degrade if max < vlenb*2 */
> +    vsetvli     zero, t1, e8, m2, ta, mu
> +    vid.v       v30
> +    addi        t2, t1, -1                  /* mask unaligned part of ptr */
> +    and         t6, a0, t2                  /* unaligned part of lhs */
> +    and         t5, a1, t2                  /* unaligned part of rhs */
> +    sub         t6, t1, t6                  /* safe count to read from lhs */
> +    sub         t5, t1, t5                  /* same, rhs */
> +    vmsltu.vx   v28, v30, t6                /* mask for first part of lhs */
> +    vmsltu.vx   v26, v30, t5                /* mask for first part of rhs */
> +    vmv.v.x     v16, zero
> +    vmv.v.x     v18, zero
> +
> +
> +1:  blt         a2, t1, .Ltail
> +    vmv.v.v     v0, v28                     /* lhs mask */
> +    vle8.v      v2, (a0), v0.t              /* masked load from lhs */
> +    vmseq.vx    v16, v2, zero, v0.t         /* check loaded bytes for null */
> +    vmv.v.v     v0, v26                     /*       rhs mask */
> +    vfirst.m    t2, v16                     /* get lhs check result */
> +    bgez        t2, .Ltail                  /* can we safely check rest */
> +    vle8.v      v4, (a1), v0.t              /*       masked load from rhs */
> +    vmseq.vx    v18, v4, zero, v0.t         /*       check partial rhs */
> +    vmnot.m     v0, v28                     /* mask for rest of lhs */
> +    vfirst.m    t3, v18                     /* get check result */
> +    bltz        t3, 2f                      /* test it */
> +    bge         t3, t6, .Ltail
> +
> +    vmsleu.vx   v0, v30, t3                 /* select rest of string + null */
> +    vmsne.vv    v0, v2, v4, v0.t            /* compare */
> +    vfirst.m    t3, v0
> +    bgez        t3, 3f
> +    mv          a0, zero
> +    ret
> +3:  add a0, a0, t3
> +    add a1, a1, t3
> +    lbu t0, (a0)
> +    lbu t1, (a1)
> +.Ldiff:
> +    sub a0, t0, t1
> +    ret
> +
> +    /* ...no null terminator in first part of lhs or rhs */
> +2:  vle8.v      v2, (a0), v0.t              /* load rest of lhs */
> +    vmnot.m     v0, v26                     /* mask for rest of rhs */
> +    vle8.v      v4, (a1), v0.t              /* load rest of rhs */
> +    vmsne.vv    v0, v2, v4                  /* compare */
> +    add         a0, a0, t1                  /* advance ptrs */
> +    vfirst.m    t3, v0
> +    add         a1, a1, t1
> +    sub         a2, a2, t1
> +    bltz        t3, 1b
> +
> +    sub t3, t3, t1  /* found a diff but we've already advanced a0 and a1 */
> +    j 3b
> +
> +.Ltail:
> +    beqz a2, 1f
> +    addi a2, a2, -1
> +    lbu t0, (a0)
> +    lbu t1, (a1)
> +    bne t0, t1, .Ldiff
> +    addi a0, a0, 1
> +    addi a1, a1, 1
> +    bnez t0, .Ltail
> +1:  mv a0, zero
> +    ret
> +
> +
> +.size strncmp, .-strncmp
> +libc_hidden_builtin_def (strncmp)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strncpy.S b/sysdeps/riscv/rv64/rvv/strncpy.S
> new file mode 100644
> index 0000000000..8b3a1e545c
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strncpy.S
> @@ -0,0 +1,96 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strncpy
> +.type   strncpy,@function
> +
> +/*
> + *  optimized strncpy for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 2*vlenb
> + */
> +
> +.align    2
> +strncpy:
> +    mv          t0, a0                    /* need to return dest so copy */
> +
> +    csrr        t1, vlenb                 /* find vlenb*2 */
> +    add         t1, t1, t1
> +
> +    addi        t2, t1, -1                /* mask off unaligned part of ptr */
> +    and         t2, a1, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2                /* search to align the pointer */
> +    vsetvli     zero, t2, e8, m2, tu, mu
> +    vle8.v      v2, (a1)
> +    vmseq.vx    v4, v2, zero
> +    vmsif.m     v0, v4                    /* copy to dest */
> +    vfirst.m    t3, v4
> +    bgeu        t2, a2, .Ldest_full
> +    vse8.v      v2, (t0), v0.t
> +    bgez        t3, .Lterminator_found
> +    add         t0, t0, t2
> +    add         a1, a1, t2
> +    sub         a2, a2, t2
> +    beqz        a2, .Ldone
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> +
> +1:
> +    vle8.v      v2, (a1)
> +    add         a1, a1, t1
> +    vmseq.vx    v4, v2, zero
> +    vmsif.m     v0, v4
> +    vfirst.m    t3, v4
> +    bgeu        t1, a2, .Ldest_full
> +    vse8.v      v2, (t0), v0.t
> +    add         t0, t0, t1
> +    sub         a2, a2, t1
> +    bltz        t3, 1b
> +    sub         t0, t0, t1
> +
> +.Lterminator_found:
> +    addi        sp, sp, -16
> +    sd          ra, 0(sp)
> +    sd          a0, 8(sp)
> +    add         a0, t0, t3
> +    mv          a1, zero
> +    sub         a2, a2, t3
> +    jal         ra, memset
> +    ld          ra, 0(sp)
> +    ld          a0, 8(sp)
> +    addi        sp, sp, 16
> +.Ldone:
> +    ret
> +
> +.Ldest_full:
> +    vid.v       v6
> +    vmsltu.vx   v4, v6, a2
> +    vmand.mm    v0, v0, v4
> +    vse8.v      v2, (t0), v0.t
> +    ret
> +
> +.size   strncpy, .-strncpy
> +libc_hidden_builtin_def (strncpy)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strnlen.S b/sysdeps/riscv/rv64/rvv/strnlen.S
> new file mode 100644
> index 0000000000..6d7ee65c7a
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strnlen.S
> @@ -0,0 +1,81 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +

Maybe use a generic implementation that calls memchr (which should be optimised
using vector instructions) [3]?  It would be an extra function call, but it should
really help with both code size and icache pressure.

[3] https://patchwork.sourceware.org/project/glibc/patch/20230201170406.303978-6-adhemerval.zanella@linaro.org/
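
That generic version boils down to something like this (a sketch; the file in the linked series adds the usual glibc symbol plumbing):

/* Sketch of a memchr-based strnlen, so the vectorised memchr does the
   scanning.  */
#include <string.h>

size_t
sketch_strnlen (const char *s, size_t maxlen)
{
  const char *p = memchr (s, '\0', maxlen);
  return p != NULL ? (size_t) (p - s) : maxlen;
}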

> +.globl  __strnlen
> +.type   __strnlen,@function
> +
> +/* vector optimized strnlen
> + * assume it's safe to read to the end of the page
> + * containing either a null terminator or the last byte of the count or both,
> + * but not past it
> + * assume page size >= vlenb*2
> + */
> +
> +.align    2
> +__strnlen:
> +    mv          t4, a0               /* stash a copy of start for later */
> +    beqz        a1, .LzeroCount
> +
> +    csrr        t1, vlenb            /* find vlenb*2 */
> +    add         t1, t1, t1
> +    addi        t2, t1, -1           /* mask off unaligned part of ptr */
> +    and         t2, a1, a0
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2           /* search to align pointer to t1 */
> +    bgeu        t2, a1, 2f           /* check it's safe */
> +    mv          t2, a1               /* it's not! look as far as permitted */
> +2:  vsetvli     t2, t2, e8, m2, ta, ma
> +    vle8.v      v2, (a0)
> +    vmseq.vx    v0, v2, zero
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lfound
> +    add         a0, a0, t2
> +    sub         a1, a1, t2
> +    bltu        a1, t1, .LreachedCount
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, ma    /* do 2*vlenb bytes per pass */
> +
> +1:  vle8.v      v2, (a0)
> +    sub         a1, a1, t1
> +    vmseq.vx    v0, v2, zero
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lfound
> +    add         a0, a0, t1
> +    bgeu        a1, t1, 1b
> +.LreachedCount:
> +    mv          t2, a1    /* in case 0 < a1 < t1 */
> +    bnez        a1, 2b    /* if so, still t2 bytes to check, all safe */
> +.LzeroCount:
> +    sub         a0, a0, t4
> +    ret
> +
> +.Lfound:        /* found the 0; subtract buffer start from current pointer */
> +    add         a0, a0, t3 /* and add offset into fetched data */
> +    sub         a0, a0, t4
> +    ret
> +
> +.size   __strnlen, .-__strnlen
> +weak_alias (__strnlen, strnlen)
> +libc_hidden_builtin_def (__strnlen)
> +libc_hidden_builtin_def (strnlen)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strrchr.S b/sysdeps/riscv/rv64/rvv/strrchr.S
> new file mode 100644
> index 0000000000..4bef8a3b9c
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strrchr.S
> @@ -0,0 +1,88 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>

Is it really worth adding a strrchr optimisation?  The generic implementation
already calls strchr (which should be optimised).
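
I.e. something along these lines (a rough sketch of the shape of the generic code), where each pass of the loop is handled by the vectorised strchr:

/* Rough sketch of the strchr-based generic strrchr.  */
#include <string.h>

char *
sketch_strrchr (const char *s, int c)
{
  const char *found = NULL;
  const char *p;

  if (c == '\0')
    return strchr (s, '\0');        /* the terminator itself */

  while ((p = strchr (s, c)) != NULL)
    {
      found = p;                    /* remember the most recent match */
      s = p + 1;                    /* keep scanning after it */
    }

  return (char *) found;
}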

> +
> +.globl  strrchr
> +.type   strrchr,@function
> +
> +/*
> + *  optimized strrchr for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 2*vlenb
> + */
> +
> +.align    2
> +strrchr:
> +    mv          t5, a0    /* stash buffer ptr somewhere safe */
> +    mv          a0, zero  /* result is nullptr unless we find better below */
> +
> +    csrr        t1, vlenb                /* determine vlenb*2 */
> +    add         t1, t1, t1
> +    addi        t2, t1, -1               /* mask off unaligned part of ptr */
> +    and         t2, t5, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2               /* search to align ptr to 2*vlenb */
> +    vsetvli     t2, t2, e8, m2, ta, mu
> +
> +    vle8.v      v2, (t5)                 /* load data into v2(,v3) */
> +    vmseq.vx    v4, v2, zero             /* check for null terminator */
> +    vfirst.m    t4, v4                   /* grab its position, if any */
> +    vmsbf.m     v0, v4                   /* select valid chars */
> +    vmseq.vx    v0, v2, a1, v0.t         /* search for candidate byte */
> +    vfirst.m    t3, v0                   /* grab its position, if any */
> +    bltz        t3, 2f                   /* did we find a candidate? */
> +
> +3:  add         a0, t3, t5               /* we did! grab the address */
> +    vmsof.m     v1, v0                   /* there might be more than one */
> +    vmandn.mm   v0, v0, v1               /* so clear the one we just found */
> +    vfirst.m    t3, v0                   /* is there another? */
> +    bgez        t3, 3b
> +
> +2:  bgez        t4, .Ldone               /* did we see a null terminator? */
> +    add         t5, t5, t2
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> +
> +1:  vle8.v      v2, (t5)
> +    vmseq.vx    v4, v2, zero
> +    vfirst.m    t4, v4
> +    vmsbf.m     v0, v4
> +    vmseq.vx    v0, v2, a1, v0.t
> +    vfirst.m    t3, v0
> +    bltz        t3, 2f
> +
> +3:  add         a0, t3, t5
> +    vmsof.m     v1, v0
> +    vmandn.mm   v0, v0, v1
> +    vfirst.m    t3, v0
> +    bgez        t3, 3b
> +
> +2:  add         t5, t5, t1
> +    bltz        t4, 1b
> +
> +.Ldone:
> +    ret
> +
> +.size   strrchr, .-strrchr
> +libc_hidden_builtin_def (strrchr)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strspn.S b/sysdeps/riscv/rv64/rvv/strspn.S
> new file mode 100644
> index 0000000000..2b9af5cc2d
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strspn.S
> @@ -0,0 +1,189 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strspn
> +.type   strspn,@function
> +
> +.globl  strcspn
> +.type   strcspn,@function
> +
> +/*
> + *  optimized strspn / strcspn for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 32
> + *  strategy:
> + *  - build a 256-bit table on the stack, where each elt is zero
> + *    if encountering it should terminate computation and nonzero otherwise
> + *  - use vectorised lookups into this to check 2*vlen elts at a time;
> + *    this code is identical for strspan and strcspan and can be shared
> + *
> + *  note that while V mandates at least 128 bit wide regs,
> + *  we are building a 256 bit lookup table
> + *  therefore we use either LMUL=1 or 2 depending on what the target supports
> + *  therefore we only use even vector register numbers,
> + *  so everything still works if we go with LMUL=2
> + */
> +

I wonder if we could adapt the generic implementation so that riscv only
reimplements the vectorised search instead of all the boilerplate to generate
the table and the early tests.
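
The boilerplate in question is essentially the table setup below (a sketch of its shape, not the exact generic file); ideally only the scanning loop would need an arch-specific override:

/* Sketch of the accept-set table the generic strspn builds; a riscv
   version could keep this part and replace only the byte scan with the
   vectorised vrgather lookup.  */
#include <stddef.h>

size_t
sketch_strspn (const char *s, const char *accept)
{
  unsigned char table[256] = { 0 };
  const unsigned char *a, *p;

  /* Mark every byte value that appears in the accept set.  */
  for (a = (const unsigned char *) accept; *a != '\0'; a++)
    table[*a] = 1;

  /* Scalar scan; table[0] is 0, so the loop stops at the terminator.
     This is the part the vectorised lookup would replace.  */
  for (p = (const unsigned char *) s; table[*p]; p++)
    continue;

  return (size_t) (p - (const unsigned char *) s);
}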

> +# -----------------------------
> +
> +.align    2
> +
> +strspn:
> +    lbu         t0, 0(a1)
> +    bnez        t0, .Lbuild_table
> +    mv          a0, zero
> +    ret
> +
> +.Lbuild_table:
> +    mv          a6, a0 /* store incoming a0 */
> +    li          t1, 32 /* want to deal with 256 bits at a time, so 32 bytes */
> +
> +    vsetvli     zero, t1, e8, m1, tu, mu
> +#if __riscv_v_min_vlen < 256
> +    /* we want to build a 256-bit table, so use vlenb*2,
> +     * m2 if regs are 128 bits wide or vlenb, m1 if >= 256
> +     * 'V' extension specifies a minimum vlen of 128 so this should cover
> +     * all cases; we can skip the check if we know vlen >= 256 at compile time
> +     */
> +    csrr        t2, vlenb
> +    bgeu        t2, t1, 1f
> +    vsetvli     zero, t1, e8, m2, tu, mu
> +1:
> +#endif // __riscv_v_min_vlen
> +
> +    /* read one char from the charset at a time and write the correct bit
> +     * in the lookup table; we could do SIMD iff we ever get an extension
> +     * that provides some way of scattering bytes into a reg group
> +     */
> +    vmv.v.x     v16, zero       /* clear out table */
> +    vmv.v.x     v8, zero        /* clear out v8 */
> +    li          t3, 1
> +    vmv.s.x     v8, t3          /* v8 now all zeroes except bottom byte */
> +
> +1:  vmv.v.x     v2, zero        /* clear out v2 */
> +    addi        a1, a1, 1       /* advance charset ptr */
> +    srli        t2, t0, 3       /* divide the byte we read earlier by 8 */
> +    vslideup.vx v2, v8, t2      /* v2 now 1 in the correct byte 0 elsewhere */
> +    vsll.vx     v2, v2, t0      /* v2 now 1 in the correct bit, 0 elsewhere */
> +    vor.vv      v16, v16, v2    /* or it in */
> +    lbu         t0, 0(a1)       /* fetch next byte */
> +    bnez        t0, 1b          /* if it's not null, go round again */
> +
> +/*
> + *   Table is now built in v16.
> + *   Strategy:
> + *   - fetch next t1 bytes from memory
> + *   - vrgather on their values divided by 8 to get relevant bytes of table
> + *   - shift right to get the correct bit into bit 1
> + *   - and with 1, compare with expected terminator value, then check mask
> + *     to see if we've found a terminator
> + *
> + *   Before we can begin, a0 needs to be t1-aligned, so that when we fetch
> + *   the next t1 bytes - any of which may be the null terminator -
> + *   we do not cross a page boundary and read unmapped memory. Therefore
> + *   we have one read of however many bytes are needed to align a0,
> + *   before the main loop.
> + */
> +
> +.Lscan_table:
> +    vmv.v.x     v8, t3              /* v8 now t1 bytes of 0x01 */
> +
> +    and         t2, a0, t1          /* mask to align to t1 */
> +    beqz        t2, 2f              /* or skip if we're already aligned */
> +    sub         t2, t1, t2          /* t2 now bytes to read to align to t1 */
> +
> +    vid.v       v2                  /* build mask instead of changing vl */
> +    vmsltu.vx   v0, v2, t2          /* so we don't need to track LMUL */
> +
> +    vle8.v      v2, (a0), v0.t      /* load next bytes from input */
> +    vsrl.vi     v4, v2, 3           /* divide by 8 */
> +    vrgather.vv v6, v16, v4         /* corresponding bytes of bit table */
> +    vsrl.vv     v6, v6, v2          /* shift correct bits to lsb */
> +    vand.vv     v6, v6, v8          /* and with 1 to complete the lookups */
> +    vmseq.vx    v4, v6, zero, v0.t  /* check to see if any 0s are present */
> +    vfirst.m    t0, v4              /* index of the first 0, if any */
> +    bgez        t0, .Lscan_end      /* if we found one, stop */
> +    add         a0, a0, t2          /* advance by number of bytes we read */
> +
> +2:  add         a6, a6, t1     /* we'll advance a0 before the exit check */
> +1:  vle8.v      v2, (a0)       /* as above but unmasked so t1 elts per pass */
> +    add         a0, a0, t1
> +
> +    vsrl.vi     v4, v2, 3
> +    vrgather.vv v6, v16, v4
> +    vsrl.vv     v6, v6, v2
> +    vand.vv     v6, v6, v8
> +
> +    vmseq.vx    v4, v6, zero
> +    vfirst.m    t0, v4
> +    bltz        t0, 1b
> +
> +.Lscan_end:
> +    add         a0, a0, t0     /* calculate offset to terminating byte */
> +    sub         a0, a0, a6
> +    ret
> +.size   strspn, .-strspn
> +
> +/* strcspn
> + *
> + * table build exactly as for strspn, except:
> + * - the lookup table starts with all bits except bit 0 of byte 0 set
> + * - we clear the corresponding bit for each byte in the charset
> + * once table is built, we can reuse the scan code directly
> + */
> +
> +strcspn:
> +    lbu         t0, 0(a1)
> +    beqz        t0, strlen   /* no rejections -> prefix is whole string */
> +
> +    mv          a6, a0
> +    li          t1, 32
> +
> +    vsetvli     zero, t1, e8, m1, tu, mu
> +#if __riscv_v_min_vlen < 256
> +    csrr        t2, vlenb
> +    bgeu        t2, t1, 1f
> +    vsetvli     zero, t1, e8, m2, tu, mu
> +1:
> +#endif // __riscv_v_min_vlen
> +
> +    vmv.v.x     v8, zero
> +    li          t3, 1           /* all bits clear except bit 0 of byte 0 */
> +    vmv.s.x     v8, t3
> +    vnot.v      v16, v8         /* v16 is the inverse of that */
> +    li          t4, -1
> +
> +1:  vmv.v.x     v2, zero
> +    addi        a1, a1, 1       /* advance charset ptr */
> +    srli        t2, t0, 3       /* select correct bit in v2 */
> +    vslideup.vx v2, v8, t2
> +    vsll.vx     v2, v2, t0
> +    vnot.v      v2, v2          /* invert */
> +    vand.vv     v16, v16, v2    /* clear the relevant bit of table */
> +    lbu         t0, 0(a1)
> +    bnez        t0, 1b
> +    j           .Lscan_table
> +.size   strcspn, .-strcspn
> +
> +libc_hidden_builtin_def (strspn)
> +libc_hidden_builtin_def (strcspn)
> \ No newline at end of file
  
Noah Goldstein Feb. 1, 2023, 6:11 p.m. UTC | #6
On Wed, Feb 1, 2023 at 3:54 AM Sergei Lewis <slewis@rivosinc.com> wrote:
>
> Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
> strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
> targeting the riscv "V" extension, version 1.0
>
> The vectorised implementations assume VLENB of at least 128 and at least 32
> registers (as mandated by the "V" extension spec). They also assume that
> VLENB is a power of two which is no larger than the page size, and (as
> vectorised code in glibc for other platforms does) that it is safe to read
> past null terminators / buffer ends provided one does not cross a page
> boundary.

There should probably be a mention of performance gains vs the generic
implementations in the commit message to justify this.
>
> Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
> ---
>  sysdeps/riscv/rv64/rvv/Implies     |   2 +
>  sysdeps/riscv/rv64/rvv/memchr.S    | 127 +++++++++++++++++++
>  sysdeps/riscv/rv64/rvv/memcmp.S    |  93 ++++++++++++++
>  sysdeps/riscv/rv64/rvv/memcpy.S    | 154 +++++++++++++++++++++++
>  sysdeps/riscv/rv64/rvv/memmove.c   |  22 ++++
>  sysdeps/riscv/rv64/rvv/memset.S    |  89 ++++++++++++++
>  sysdeps/riscv/rv64/rvv/strchr.S    |  92 ++++++++++++++
>  sysdeps/riscv/rv64/rvv/strchrnul.c |  22 ++++
>  sysdeps/riscv/rv64/rvv/strcmp.S    | 108 +++++++++++++++++
>  sysdeps/riscv/rv64/rvv/strcpy.S    |  72 +++++++++++
>  sysdeps/riscv/rv64/rvv/strcspn.c   |  22 ++++
>  sysdeps/riscv/rv64/rvv/strlen.S    |  67 ++++++++++
>  sysdeps/riscv/rv64/rvv/strncmp.S   | 104 ++++++++++++++++
>  sysdeps/riscv/rv64/rvv/strncpy.S   |  96 +++++++++++++++
>  sysdeps/riscv/rv64/rvv/strnlen.S   |  81 +++++++++++++
>  sysdeps/riscv/rv64/rvv/strrchr.S   |  88 ++++++++++++++
>  sysdeps/riscv/rv64/rvv/strspn.S    | 189 +++++++++++++++++++++++++++++
>  17 files changed, 1428 insertions(+)
>  create mode 100644 sysdeps/riscv/rv64/rvv/Implies
>  create mode 100644 sysdeps/riscv/rv64/rvv/memchr.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/memcmp.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/memcpy.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/memmove.c
>  create mode 100644 sysdeps/riscv/rv64/rvv/memset.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strchr.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strchrnul.c
>  create mode 100644 sysdeps/riscv/rv64/rvv/strcmp.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strcpy.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strcspn.c
>  create mode 100644 sysdeps/riscv/rv64/rvv/strlen.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strncmp.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strncpy.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strnlen.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strrchr.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strspn.S
>
> diff --git a/sysdeps/riscv/rv64/rvv/Implies b/sysdeps/riscv/rv64/rvv/Implies
> new file mode 100644
> index 0000000000..b07b4cb906
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/Implies
> @@ -0,0 +1,2 @@
> +riscv/rv64/rvd
> +
> diff --git a/sysdeps/riscv/rv64/rvv/memchr.S b/sysdeps/riscv/rv64/rvv/memchr.S
> new file mode 100644
> index 0000000000..a7e32b8f25
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memchr.S
> @@ -0,0 +1,127 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +
> +/* Optimised memchr for riscv with vector extension
> + * Assumptions:
> + *    - cpu becomes bandwidth limited at or before
> + *            2 vector register sized read/write operations
> + *          + 2 scalar operations
> + *          + conditional branch
> + */
> +
> +.globl  memchr
> +.type   memchr,@function
> +
> +.align    2
> +memchr:
> +    beqz a2, .Lnot_found
> +    csrr    t1, vlenb
> +    bgeu    a2, t1, .Lvector_path   /* only use vector path if we're scanning
> +                                       at least vlenb bytes */
> +
> +#ifndef __riscv_strict_align
> +    li a3, 8
> +    blt a2, a3, .Lbytewise
> +
> +    li      t1, 0x0101010101010101
> +    slli    a4, t1, 7     /* a4 = 0x8080808080808080 */
> +    mul     t2, a1, t1    /* entirety of t2 is now repeats of target character;
> +                             assume mul is at worst no worse than 3*(shift+OR),
> +                             otherwise do that instead */
> +
> +/*
> + * strategy:
> + * t4 = ((*a0) ^ t2)
> + *      - now t4 contains zero bytes if and only if next word of memory
> + *        had target character at those positions
> + *
> + * t4 = ((t4-0x0101010101010101) & ~t4) & 0x8080808080808080
> + *      - all nonzero bytes of t4 become 0; zero bytes become 0x80
> + *
> + * if t4 is nonzero, find the index of the byte within it, add to a0 and return
> + * otherwise, loop
> + */
> +
> +1:
> +    ld t4, (a0)          /* t4 = load next 8 bytes */
> +    xor t4, t4, t2
> +    sub t5, t4, t1
> +    not t4, t4
> +    and t4, t5, t4
> +    and t4, t4, a4
> +    bnez t4, .Lbytewise  /* could use ctzw, mod+lookup or just binary chop
> +                            to locate byte of interest in t4 but profiling
> +                            shows these approaches are at best no better */
> +    addi a2, a2, -8
> +    addi a0, a0, 8
> +    bgeu a2, a3, 1b
> +    beqz a2, .Lnot_found
> +#endif // __riscv_strict_align
> +
> +/* too little data for a dword. mask calculation and branch mispredict costs
> +   make checking a word not worthwhile. degrade to bytewise search. */
> +
> +.Lbytewise:
> +    add t2, a0, a2
> +
> +1:
> +    lb t1, (a0)
> +    beq t1, a1, .Lfound
> +    addi a0, a0, 1
> +    blt a0, t2, 1b
> +
> +.Lnot_found:
> +    mv a0, zero
> +.Lfound:
> +    ret
> +
> +.Lvector_path:
> +    vsetvli t2, a2, e8, m2, ta, ma
> +
> +1:
> +    vle8.v      v2, (a0)
> +    vmseq.vx    v0, v2, a1
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lvec_found
> +    add         a0, a0, t2
> +    sub         a2, a2, t2
> +    bge         a2, t2, 1b
> +    bnez        a2, 2f
> +    mv          a0, zero
> +    ret
> +
> +2:
> +    vsetvli t2, a2, e8, m2, ta, ma
> +    vle8.v      v2, (a0)
> +    vmseq.vx    v0, v2, a1
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lvec_found
> +    mv          a0, zero
> +    ret
> +
> +.Lvec_found:
> +    add a0, a0, t3
> +    ret
> +
> +.size   memchr, .-memchr
> +libc_hidden_builtin_def (memchr)
> \ No newline at end of file
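
The word-at-a-time strategy described in the memchr comment above is the
classic zero-byte trick; a rough C sketch (illustration only, not part of
the patch, with a made-up helper name):

    #include <stdint.h>

    #define ONES  0x0101010101010101ULL
    #define HIGHS 0x8080808080808080ULL

    /* Nonzero iff some byte of WORD equals C: the XOR turns matching bytes
       into zero bytes, and (x - ONES) & ~x & HIGHS leaves 0x80 exactly in
       the zero bytes of x.  */
    static inline int
    word_has_byte (uint64_t word, unsigned char c)
    {
      uint64_t x = word ^ (ONES * c);
      return ((x - ONES) & ~x & HIGHS) != 0;
    }
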
> diff --git a/sysdeps/riscv/rv64/rvv/memcmp.S b/sysdeps/riscv/rv64/rvv/memcmp.S
> new file mode 100644
> index 0000000000..a945753a5f
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memcmp.S
> @@ -0,0 +1,93 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +/* Optimised memcmp for riscv with vector extension
> + */
> +
> +.globl  memcmp
> +.type   memcmp,@function
> +
> +.align    2
> +
> +memcmp:
> +    mv t2, zero
> +    beqz a2, .Ldone
> +
> +    li          t1, 5            /* scalar path cheaper for 1-4 elts */
> +    bltu        a2, t1, .Lscalar
> +
> +    /* main loop, vlenb*2 elts at a time */
> +    vsetvli     t1, a2, e8, m2, ta, ma
> +
> +1:
> +    vle8.v      v2, (a0)        /* load elts */
> +    vle8.v      v4, (a1)
> +    vmsne.vv    v0, v2, v4      /* compare */
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lvec_diff  /* found a difference ? */
> +    add         a0, a0, t1      /* not yet, advance everything */
> +    add         a1, a1, t1
> +    sub         a2, a2, t1
> +    bgeu        a2, t1, 1b
> +
> +    bnez        a2, .Ltail
> +    mv          a0, zero
> +    ret
> +
> +.Ltail:
> +    /* handle tail. we know a2 < vlenb*2 so just load and compare the lot */
> +    vsetvli     t1, a2, e8, m2, ta, ma
> +    vle8.v      v2, (a0)
> +    vle8.v      v4, (a1)
> +    vmsne.vv    v0, v2, v4
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lvec_diff
> +    mv          a0, zero         /* no diff found */
> +    ret
> +
> +.Lvec_diff:         /* v2, v4 differ at elt t3 */
> +    add a0, a0, t3
> +    add a1, a1, t3
> +    lbu t0, (a0)
> +    lbu t1, (a1)
> +    sub a0, t0, t1
> +    ret
> +
> +.Lscalar:
> +    add  t3, a0, a2
> +
> +1:
> +    lbu t0, (a0)
> +    lbu t1, (a1)
> +    sub t2, t0, t1
> +    bnez t2, .Ldone
> +    addi a0, a0, 1
> +    addi a1, a1, 1
> +    bltu a0, t3, 1b
> +
> +.Ldone:
> +    mv a0, t2
> +    ret
> +
> +
> +.size memcmp, .-memcmp
> +libc_hidden_builtin_def (memcmp)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/memcpy.S b/sysdeps/riscv/rv64/rvv/memcpy.S
> new file mode 100644
> index 0000000000..7b37ec285d
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memcpy.S
> @@ -0,0 +1,154 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +/* Optimised memcpy and memmove for riscv with vector extension
> + */
> +
> +.globl  memcpy
> +.type   memcpy,@function
> +.globl  memmove
> +.type   memmove,@function
> +
> +.align    2
> +memmove:
> +    bge     a0, a1, .Lmemcpy_rev
> +
> +memcpy:
> +.Lmemcpy_fwd:
> +    mv      t0, a0          /* t0 = preserve a0 so we can return it */
> +    csrr    t2, vlenb       /* t2 = number of bytes per vectorised copy op */
> +    slli    t5, t2,  1      /* t5 = number of bytes per loop */
> +    addi    t3, t5, -1      /* generate mask */
> +    not     t4, t3
> +    and     t4, a2, t4      /* t4 = bytes copied in vectorised pass */
> +
> +    beqz    t4, .Lscalar_fwd    /* size too small for even one pass? */
> +
> +    and    a2, a2, t3           /* a2 = bytes still left to copy after pass */
> +    add    t4, t4, a1           /* t4 = src at end of vectorised pass */
> +
> +1:
> +    vl2r.v  v2, (a1)            /* load, advance source */
> +    add     a1, a1, t5
> +    vs2r.v  v2, (t0)            /* store, advance dest */
> +    add     t0, t0, t5
> +    bltu    a1, t4, 1b          /* src at end? */
> +
> +    bltu    a2, t2, .Lscalar_fwd /* should we do one more vec load/store? */
> +    vl1r.v  v2, (a1)
> +    sub     a2, a2, t2
> +    add     a1, a1, t2
> +    vs1r.v  v2, (t0)
> +    add     t0, t0, t2
> +
> +.Lscalar_fwd:
> +    bnez    a2, .Lnobail
> +.Lbail:
> +    ret
> +.Lnobail:
> +
> +#ifndef __riscv_strict_align
> +    addi    t2, zero, 4
> +    bltu    a2, t2, .Lsingle_bytes
> +1:
> +    lw      t3, 0(a1)
> +    addi    a1, a1, 4
> +    sw      t3, 0(t0)
> +    addi    t0, t0, 4
> +    addi    a2, a2, -4
> +    bgeu    a2, t2, 1b
> +#endif // __riscv_strict_align
> +
> +.Lsingle_bytes:
> +    beqz    a2, .Lbail
> +    add     a2, a2, a1          /* a2 = src + remaining size */
> +1:
> +    lb      t1, 0(a1)
> +    sb      t1, 0(t0)
> +    addi    a1, a1, 1
> +    addi    t0, t0, 1
> +    bltu    a1, a2, 1b
> +    ret
> +.size   memcpy,     .-memcpy
> +
> +
> +.Lmemcpy_rev:
> +    beq     a0, a1, .Lmemcpy_rev_done
> +    add     t0, a0, a2          /* t0 = dest so we can return a0=dest later */
> +    add     t6, a1, a2          /* dest and src both point to byte */
> +                                /* immediately after end of buffer */
> +
> +    csrr    t2, vlenb           /* t2 = number of bytes per pass */
> +    slli    t5, t2,  1          /* t5 = number of bytes per entire loop */
> +    addi    t3, t5, -1          /* t3 = (bytes per loop) mask */
> +    not     t4, t3              /* generate mask for bytes processed by loop */
> +    and     t4, a2, t4          /* t4 = bytes copied in vectorised pass */
> +
> +    beqz    t4, .Lscalar_rev    /* size too small for even one pass? */
> +
> +    and    a2, a2, t3           /* a2 = bytes still left to copy after pass */
> +    sub    t4, t6, t4           /* t4 = src at end of vectorised pass */
> +
> +1:
> +    sub     t6, t6, t5
> +    sub     t0, t0, t5
> +    vl2r.v  v2, (t6)            /* load, advance source */
> +    vs2r.v  v2, (t0)            /* store, advance dest */
> +    bgtu    t6, t4, 1b          /* src at end? */
> +
> +    bltu    a2, t2, .Lscalar_rev /* should we do one more vec load/store? */
> +    sub     t6, t6, t2
> +    sub     t0, t0, t2
> +    sub     a2, a2, t2
> +    vl1r.v  v2, (t6)
> +    vs1r.v  v2, (t0)
> +
> +.Lscalar_rev:
> +#ifndef __riscv_strict_align
> +    beqz    a2, .Lbail
> +
> +    addi    t2, zero, 4
> +    bltu    a2, t2, 2f
> +1:
> +    addi    t6, t6, -4
> +    addi    t0, t0, -4
> +    addi    a2, a2, -4
> +    lw      t3, 0(t6)
> +    sw      t3, 0(t0)
> +    bgeu    a2, t2, 1b
> +2:
> +#endif // __riscv_strict_align
> +
> +    beqz    a2, .Lbail
> +1:
> +    addi    t6, t6, -1
> +    addi    t0, t0, -1
> +    lb      t1, 0(t6)
> +    sb      t1, 0(t0)
> +    bgtu    t0, a0, 1b
> +
> +.Lmemcpy_rev_done:
> +    ret
> +
> +.size   memmove, .-memmove
> +libc_hidden_builtin_def (memcpy)
> +libc_hidden_builtin_def (memmove)
> \ No newline at end of file
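
The mask arithmetic at the top of both the forward and reverse paths splits
the length as in this C sketch (illustrative only; the register names in the
comments refer to the assembly above and the helper name is made up):

    #include <stddef.h>

    /* Split N into the bytes handled by the 2*vlenb main loop and the tail
       left for the optional single-register copy plus the scalar code.  */
    static inline void
    split_copy (size_t n, size_t vlenb, size_t *vec_bytes, size_t *tail)
    {
      size_t step = 2 * vlenb;         /* bytes per loop iteration (t5) */
      *vec_bytes = n & ~(step - 1);    /* t4: copied by the vector loop */
      *tail      = n & (step - 1);     /* a2 after the masking          */
    }
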
> diff --git a/sysdeps/riscv/rv64/rvv/memmove.c b/sysdeps/riscv/rv64/rvv/memmove.c
> new file mode 100644
> index 0000000000..47734854f9
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memmove.c
> @@ -0,0 +1,22 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +/* memmove is implemented in memcpy.S
> + */
> diff --git a/sysdeps/riscv/rv64/rvv/memset.S b/sysdeps/riscv/rv64/rvv/memset.S
> new file mode 100644
> index 0000000000..6f82c542b1
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memset.S
> @@ -0,0 +1,89 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +
> +/* Optimised memset for riscv with vector extension
> + */
> +
> +.globl  memset
> +.type   memset,@function
> +
> +.align    2
> +memset:
> +    mv      t0, a0                  /* t0 = dest so we can return a0 later */
> +    vsetvli t2, a2, e8, m2, ta, ma  /* t2 = elts per copy */
> +    beqz    t2, .Lscalar
> +
> +    vmv.v.x v2, a1                  /* splat value across v2 */
> +
> +    slli    t3, t2, 1
> +    bgtu    t3, a2, .Lsinglestore
> +
> +1:
> +    vse8.v  v2, (t0)
> +    add     t0, t0, t2
> +    vse8.v  v2, (t0)
> +    add     t0, t0, t2
> +    sub     a2, a2, t3
> +    bgeu    a2, t3, 1b
> +    bgeu    a2, t2, .Lsinglestore
> +    bnez    a2, .Lscalar
> +
> +.Lbail:
> +    ret
> +
> +.Lsinglestore:
> +    bgtu    t2, a2, .Lscalar
> +    vse8.v  v2, (t0)
> +    add     t0, t0, t2
> +    sub     a2, a2, t2
> +
> +.Lscalar:
> +    beqz    a2, .Lbail
> +
> +#ifndef __riscv_strict_align
> +    slli    t2, a1, 8
> +    or      a1, a1, t2
> +    slli    t2, a1, 16
> +    or      a1, a1, t2
> +
> +    addi    t2, zero, 4
> +    bltu    a2, t2, 2f
> +
> +1:
> +    sw      a1, 0(t0)
> +    addi    t0, t0, 4
> +    addi    a2, a2, -4
> +    bgeu    a2, t2, 1b
> +2:
> +    beqz    a2, .Lbail
> +#endif // __riscv_strict_align
> +
> +    add     a2, a2, t0
> +1:
> +    sb      a1, 0(t0)
> +    addi    t0, t0, 1
> +    bltu    t0, a2, 1b
> +    ret
> +
> +.size   memset,     .-memset
> +libc_hidden_builtin_def (memset)
> \ No newline at end of file
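
The slli/or pairs in the scalar tail replicate the fill byte into a 32-bit
word; in C this is roughly (sketch only, illustrative helper name):

    #include <stdint.h>

    /* Replicate C into all four bytes of a word so the scalar tail can
       store four bytes per iteration.  */
    static inline uint32_t
    splat_byte (uint8_t c)
    {
      uint32_t w = c;
      w |= w << 8;
      w |= w << 16;
      return w;
    }
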
> diff --git a/sysdeps/riscv/rv64/rvv/strchr.S b/sysdeps/riscv/rv64/rvv/strchr.S
> new file mode 100644
> index 0000000000..0b37174c55
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strchr.S
> @@ -0,0 +1,92 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strchr
> +.type   strchr,@function
> +
> +.globl  __strchrnul
> +.type   __strchrnul,@function
> +
> +/*
> + *  optimized strchr for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 2*vlenb
> + */
> +
> +.align    2
> +__strchrnul:
> +    li t5, -1
> +    j  1f
> +
> +strchr:
> +    mv          t5, zero
> +1:  csrr        t1, vlenb              /* find vlenb*2 */
> +    add         t1, t1, t1
> +    addi        t2, t1, -1             /* mask off unaligned part of pointer */
> +    and         t2, a0, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2             /* search however many bytes
> +                                          are needed to align the pointer */
> +    vsetvli     t2, t2, e8, m2, ta, mu
> +
> +    vle8.v      v2, (a0)               /* load data into v2(,v3) */
> +    vmseq.vx    v4, v2, zero
> +    vfirst.m    t4, v4
> +    vmsbf.m     v0, v4
> +    vmseq.vx    v0, v2, a1, v0.t
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lfound
> +    bgez        t4, .Lbufferend
> +    add         a0, a0, t2
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, mu
> +    li          t4, -1
> +
> +1:
> +    vle8.v      v2, (a0)
> +    vmseq.vx    v4, v2, zero
> +    vfirst.m    t4, v4
> +    vmsbf.m     v0, v4
> +    vmseq.vx    v0, v2, a1, v0.t
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lfound
> +    bgez        t4, .Lbufferend
> +    add         a0, a0, t1
> +    j           1b
> +
> +.Lfound:                                    /* found the target at a0+t3 */
> +    add         a0, a0, t3
> +    ret
> +
> +.Lbufferend:
> +    add         a0, a0, t4
> +    and         a0, a0, t5
> +    ret
> +
> +.size   strchr, .-strchr
> +.size   __strchrnul, .-__strchrnul
> +
> +libc_hidden_builtin_def (strchr)
> +weak_alias (__strchrnul, strchrnul)
> \ No newline at end of file
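
The way strchr and strchrnul share a tail may be easier to see in C: t5 is
primed with 0 for strchr and -1 for strchrnul, so masking the terminator's
address yields either NULL or that address (rough sketch, not part of the
patch; the helper name is made up):

    #include <stdint.h>

    /* NUL_POS is the address of the terminator when the target byte was
       not found; MASK is 0 for strchr and ~0 for strchrnul (t5 above).  */
    static inline char *
    shared_tail (char *nul_pos, uintptr_t mask)
    {
      return (char *) ((uintptr_t) nul_pos & mask);
    }
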
> diff --git a/sysdeps/riscv/rv64/rvv/strchrnul.c b/sysdeps/riscv/rv64/rvv/strchrnul.c
> new file mode 100644
> index 0000000000..259da80358
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strchrnul.c
> @@ -0,0 +1,22 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +/* strchrnul is implemented in strchr.S
> + */
> diff --git a/sysdeps/riscv/rv64/rvv/strcmp.S b/sysdeps/riscv/rv64/rvv/strcmp.S
> new file mode 100644
> index 0000000000..4a219221ac
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcmp.S
> @@ -0,0 +1,108 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strcmp
> +.type   strcmp,@function
> +
> +.align    2
> +
> +/* most of the time, one or both sides is unaligned and their alignments differ
> + * we need to check for a null terminator before crossing a page boundary
> + * strategy:
> + * - for each side, calculate masks for alignment and (vlenb * 2) - alignment
> + * - while no difference encountered:
> + * -   for each side:
> + * -       load bytes to end of next vlenb*2 block
> + * -       check for null terminator
> + * -       if no terminator, load bytes to fill rest of register
> + * -   compare sides
> + */
> +
> +strcmp:
> +    csrr        t1, vlenb                   /* find vlenb*2 */
> +    add         t1, t1, t1
> +    vsetvli     zero, t1, e8, m2, ta, mu
> +    vid.v       v30
> +    addi        t2, t1, -1         /* mask for unaligned part of ptr */
> +    and         t6, a0, t2         /* unaligned part of lhs */
> +    and         t5, a1, t2         /* unaligned part of rhs */
> +    sub         t6, t1, t6         /* safe number of lhs bytes to read */
> +    sub         t5, t1, t5         /* same, rhs */
> +    vmsltu.vx   v28, v30, t6       /* v28 = mask for first half of lhs load */
> +    vmsltu.vx   v26, v30, t5       /* v26 = mask for first half of rhs load */
> +    vmv.v.x     v16, zero
> +    vmv.v.x     v18, zero
> +
> +1:  vmv.v.v     v0, v28              /* lhs mask */
> +    vle8.v      v2, (a0), v0.t       /* masked load from lhs */
> +    vmseq.vx    v16, v2, zero, v0.t  /* check loaded bytes for null */
> +    vmv.v.v     v0, v26              /*       rhs mask */
> +    vfirst.m    t2, v16              /* get lhs check result */
> +    bgez        t2, .Ltail           /* bail if we can't safely check rest */
> +    vle8.v      v4, (a1), v0.t       /*    masked load from rhs */
> +    vmseq.vx    v18, v4, zero, v0.t  /*    check partial rhs for null */
> +    vmnot.m     v0, v28              /* mask for rest of lhs */
> +    vfirst.m    t3, v18              /* get check result */
> +    bltz        t3, 2f               /* test it */
> +                                     /* we see null terminator */
> +    bge         t3, t6, .Ltail       /* have enough bytes for vector cmp? */
> +
> +    vmsleu.vx   v0, v30, t3          /* select rest + null */
> +    vmsne.vv    v0, v2, v4, v0.t     /* compare */
> +    vfirst.m    t3, v0
> +    bgez        t3, 3f
> +    mv          a0, zero             /* no difference */
> +    ret
> +3:  add a0, a0, t3
> +    add a1, a1, t3
> +    lbu t0, (a0)
> +    lbu t1, (a1)
> +.Ldiff:
> +    sub a0, t0, t1
> +    ret
> +
> +    /* ...no null terminator */
> +2:  vle8.v      v2, (a0), v0.t       /* load rest of lhs */
> +    vmnot.m     v0, v26              /* mask for rest of rhs */
> +    vle8.v      v4, (a1), v0.t       /* load rest of rhs */
> +    vmsne.vv    v0, v2, v4           /* compare */
> +    add         a0, a0, t1           /* advance ptrs */
> +    vfirst.m    t3, v0
> +    add         a1, a1, t1
> +    bltz        t3, 1b
> +
> +    sub t3, t3, t1 /* found difference but we've already advanced a0 and a1 */
> +    j 3b
> +
> +.Ltail:
> +    lbu t0, (a0)
> +    lbu t1, (a1)
> +    bne t0, t1, .Ldiff
> +    addi a0, a0, 1
> +    addi a1, a1, 1
> +    bnez t0, .Ltail
> +    mv a0, zero
> +    ret
> +
> +
> +.size strcmp, .-strcmp
> +libc_hidden_builtin_def (strcmp)
> \ No newline at end of file
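
The per-side load masks come from the "safe number of bytes to read"
computation; a C sketch of that step (illustration only; blk is vlenb*2,
assumed to be a power of two no larger than the page size):

    #include <stddef.h>
    #include <stdint.h>

    /* Bytes that can be read from P without crossing the next BLK-aligned
       boundary, and therefore (by the stated assumptions) without crossing
       a page boundary; t6/t5 in the assembly above.  */
    static inline size_t
    safe_read_len (const void *p, size_t blk)
    {
      return blk - ((uintptr_t) p & (blk - 1));
    }
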
> diff --git a/sysdeps/riscv/rv64/rvv/strcpy.S b/sysdeps/riscv/rv64/rvv/strcpy.S
> new file mode 100644
> index 0000000000..b21909d66f
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcpy.S
> @@ -0,0 +1,72 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strcpy
> +.type   strcpy,@function
> +
> +/*
> + *  optimized strcpy for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 2*vlenb
> + */
> +
> +.align    2
> +strcpy:
> +    mv          t0, a0                     /* copy dest so we can return it */
> +
> +    csrr        t1, vlenb                  /* find vlenb*2 */
> +    add         t1, t1, t1
> +
> +    addi        t2, t1, -1                 /* mask unaligned part of ptr */
> +    and         t2, a1, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2                 /* search enough to align ptr */
> +    vsetvli     t2, t2, e8, m2, tu, mu
> +    vle8.v      v2, (a1)
> +    vmseq.vx    v4, v2, zero
> +    vmsif.m     v0, v4                     /* copy but not past null */
> +    vfirst.m    t3, v4
> +    vse8.v      v2, (t0), v0.t
> +    bgez        t3, .Ldone
> +    add         t0, t0, t2
> +    add         a1, a1, t2
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, mu   /* now do 2*vlenb bytes per pass */
> +
> +1:
> +    vle8.v      v2, (a1)
> +    add         a1, a1, t1
> +    vmseq.vx    v4, v2, zero
> +    vmsif.m     v0, v4
> +    vfirst.m    t3, v4
> +    vse8.v      v2, (t0), v0.t
> +    add         t0, t0, t1
> +    bltz        t3, 1b
> +
> +.Ldone:
> +    ret
> +
> +.size   strcpy, .-strcpy
> +libc_hidden_builtin_def (strcpy)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strcspn.c b/sysdeps/riscv/rv64/rvv/strcspn.c
> new file mode 100644
> index 0000000000..f0595a72fb
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcspn.c
> @@ -0,0 +1,22 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +/* strcspn is implemented in strspn.S
> + */
> diff --git a/sysdeps/riscv/rv64/rvv/strlen.S b/sysdeps/riscv/rv64/rvv/strlen.S
> new file mode 100644
> index 0000000000..c77d500693
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strlen.S
> @@ -0,0 +1,67 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strlen
> +.type   strlen,@function
> +
> +/*
> + *  optimized strlen for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 2*vlenb
> + */
> +
> +.align    2
> +strlen:
> +    mv          t4, a0                    /* copy of buffer start */
> +    csrr        t1, vlenb                 /* find vlenb*2 */
> +    add         t1, t1, t1
> +    addi        t2, t1, -1                /* mask off unaligned part of ptr */
> +    and         t2, a0, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2                /* search fwd to align ptr */
> +    vsetvli     t2, t2, e8, m2, ta, ma
> +    vle8.v      v2, (a0)
> +    vmseq.vx    v0, v2, zero
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lfound
> +    add         a0, a0, t2
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, ma  /* search 2*vlenb bytes per pass */
> +    add         t4, t4, t1
> +
> +1:
> +    vle8.v      v2, (a0)
> +    add         a0, a0, t1
> +    vmseq.vx    v0, v2, zero
> +    vfirst.m    t3, v0
> +    bltz        t3, 1b
> +
> +.Lfound:                                  /* found the 0; subtract          */
> +    sub         a0, a0, t4                /* buffer start from current ptr  */
> +    add         a0, a0, t3                /* and add offset into fetched    */
> +    ret                                   /* data to get length */
> +
> +.size   strlen, .-strlen
> +libc_hidden_builtin_def (strlen)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strncmp.S b/sysdeps/riscv/rv64/rvv/strncmp.S
> new file mode 100644
> index 0000000000..863e5cb525
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strncmp.S
> @@ -0,0 +1,104 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strncmp
> +.type   strncmp,@function
> +
> +.align    2
> +
> +/* as strcmp, but with added checks on a2 (max count)
> + */
> +
> +strncmp:
> +    csrr        t1, vlenb                   /* find vlenb*2 */
> +    add         t1, t1, t1
> +    blt         a2, t1, .Ltail              /* degrade if max < vlenb*2 */
> +    vsetvli     zero, t1, e8, m2, ta, mu
> +    vid.v       v30
> +    addi        t2, t1, -1                  /* mask unaligned part of ptr */
> +    and         t6, a0, t2                  /* unaligned part of lhs */
> +    and         t5, a1, t2                  /* unaligned part of rhs */
> +    sub         t6, t1, t6                  /* safe count to read from lhs */
> +    sub         t5, t1, t5                  /* same, rhs */
> +    vmsltu.vx   v28, v30, t6                /* mask for first part of lhs */
> +    vmsltu.vx   v26, v30, t5                /* mask for first part of rhs */
> +    vmv.v.x     v16, zero
> +    vmv.v.x     v18, zero
> +
> +
> +1:  blt         a2, t1, .Ltail
> +    vmv.v.v     v0, v28                     /* lhs mask */
> +    vle8.v      v2, (a0), v0.t              /* masked load from lhs */
> +    vmseq.vx    v16, v2, zero, v0.t         /* check loaded bytes for null */
> +    vmv.v.v     v0, v26                     /*       rhs mask */
> +    vfirst.m    t2, v16                     /* get lhs check result */
> +    bgez        t2, .Ltail                  /* can we safely check rest */
> +    vle8.v      v4, (a1), v0.t              /*       masked load from rhs */
> +    vmseq.vx    v18, v4, zero, v0.t         /*       check partial rhs */
> +    vmnot.m     v0, v28                     /* mask for rest of lhs */
> +    vfirst.m    t3, v18                     /* get check result */
> +    bltz        t3, 2f                      /* test it */
> +    bge         t3, t6, .Ltail
> +
> +    vmsleu.vx   v0, v30, t3                 /* select rest of string + null */
> +    vmsne.vv    v0, v2, v4, v0.t            /* compare */
> +    vfirst.m    t3, v0
> +    bgez        t3, 3f
> +    mv          a0, zero
> +    ret
> +3:  add a0, a0, t3
> +    add a1, a1, t3
> +    lbu t0, (a0)
> +    lbu t1, (a1)
> +.Ldiff:
> +    sub a0, t0, t1
> +    ret
> +
> +    /* ...no null terminator in first part of lhs or rhs */
> +2:  vle8.v      v2, (a0), v0.t              /* load rest of lhs */
> +    vmnot.m     v0, v26                     /* mask for rest of rhs */
> +    vle8.v      v4, (a1), v0.t              /* load rest of rhs */
> +    vmsne.vv    v0, v2, v4                  /* compare */
> +    add         a0, a0, t1                  /* advance ptrs */
> +    vfirst.m    t3, v0
> +    add         a1, a1, t1
> +    sub         a2, a2, t1
> +    bltz        t3, 1b
> +
> +    sub t3, t3, t1  /* found a diff but we've already advanced a0 and a1 */
> +    j 3b
> +
> +.Ltail:
> +    beqz a2, 1f
> +    addi a2, a2, -1
> +    lbu t0, (a0)
> +    lbu t1, (a1)
> +    bne t0, t1, .Ldiff
> +    addi a0, a0, 1
> +    addi a1, a1, 1
> +    bnez t0, .Ltail
> +1:  mv a0, zero
> +    ret
> +
> +
> +.size strncmp, .-strncmp
> +libc_hidden_builtin_def (strncmp)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strncpy.S b/sysdeps/riscv/rv64/rvv/strncpy.S
> new file mode 100644
> index 0000000000..8b3a1e545c
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strncpy.S
> @@ -0,0 +1,96 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strncpy
> +.type   strncpy,@function
> +
> +/*
> + *  optimized strncpy for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 2*vlenb
> + */
> +
> +.align    2
> +strncpy:
> +    mv          t0, a0                    /* need to return dest so copy */
> +
> +    csrr        t1, vlenb                 /* find vlenb*2 */
> +    add         t1, t1, t1
> +
> +    addi        t2, t1, -1                /* mask off unaligned part of ptr */
> +    and         t2, a1, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2                /* search to align the pointer */
> +    vsetvli     zero, t2, e8, m2, tu, mu
> +    vle8.v      v2, (a1)
> +    vmseq.vx    v4, v2, zero
> +    vmsif.m     v0, v4                    /* copy to dest */
> +    vfirst.m    t3, v4
> +    bgeu        t2, a2, .Ldest_full
> +    vse8.v      v2, (t0), v0.t
> +    bgez        t3, .Lterminator_found
> +    add         t0, t0, t2
> +    add         a1, a1, t2
> +    sub         a2, a2, t2
> +    beqz        a2, .Ldone
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> +
> +1:
> +    vle8.v      v2, (a1)
> +    add         a1, a1, t1
> +    vmseq.vx    v4, v2, zero
> +    vmsif.m     v0, v4
> +    vfirst.m    t3, v4
> +    bgeu        t1, a2, .Ldest_full
> +    vse8.v      v2, (t0), v0.t
> +    add         t0, t0, t1
> +    sub         a2, a2, t1
> +    bltz        t3, 1b
> +    sub         t0, t0, t1
> +
> +.Lterminator_found:
> +    addi        sp, sp, -16
> +    sd          ra, 0(sp)
> +    sd          a0, 8(sp)
> +    add         a0, t0, t3
> +    mv          a1, zero
> +    sub         a2, a2, t3
> +    jal         ra, memset
> +    ld          ra, 0(sp)
> +    ld          a0, 8(sp)
> +    addi        sp, sp, 16
> +.Ldone:
> +    ret
> +
> +.Ldest_full:
> +    vid.v       v6
> +    vmsltu.vx   v4, v6, a2
> +    vmand.mm    v0, v0, v4
> +    vse8.v      v2, (t0), v0.t
> +    ret
> +
> +.size   strncpy, .-strncpy
> +libc_hidden_builtin_def (strncpy)
> \ No newline at end of file
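
The jal to memset implements the zero padding that strncpy requires once the
terminator has been copied; semantically the step amounts to (sketch of the
behaviour only, not of the register allocation; helper name is made up):

    #include <string.h>

    /* The terminator landed at dest[i] with i < n, so the remainder of the
       n-byte destination, from that position on, must be zero filled.  */
    static inline void
    pad_tail (char *dest, size_t i, size_t n)
    {
      memset (dest + i, 0, n - i);
    }
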
> diff --git a/sysdeps/riscv/rv64/rvv/strnlen.S b/sysdeps/riscv/rv64/rvv/strnlen.S
> new file mode 100644
> index 0000000000..6d7ee65c7a
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strnlen.S
> @@ -0,0 +1,81 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  __strnlen
> +.type   __strnlen,@function
> +
> +/* vector optimized strnlen
> + * assume it's safe to read to the end of the page
> + * containing either a null terminator or the last byte of the count or both,
> + * but not past it
> + * assume page size >= vlenb*2
> + */
> +
> +.align    2
> +__strnlen:
> +    mv          t4, a0               /* stash a copy of start for later */
> +    beqz        a1, .LzeroCount
> +
> +    csrr        t1, vlenb            /* find vlenb*2 */
> +    add         t1, t1, t1
> +    addi        t2, t1, -1           /* mask off unaligned part of ptr */
> +    and         t2, a0, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2           /* search to align pointer to t1 */
> +    bgeu        t2, a1, 2f           /* check it's safe */
> +    mv          t2, a1               /* it's not! look as far as permitted */
> +2:  vsetvli     t2, t2, e8, m2, ta, ma
> +    vle8.v      v2, (a0)
> +    vmseq.vx    v0, v2, zero
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lfound
> +    add         a0, a0, t2
> +    sub         a1, a1, t2
> +    bltu        a1, t1, .LreachedCount
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, ma    /* do 2*vlenb bytes per pass */
> +
> +1:  vle8.v      v2, (a0)
> +    sub         a1, a1, t1
> +    vmseq.vx    v0, v2, zero
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lfound
> +    add         a0, a0, t1
> +    bgeu        a1, t1, 1b
> +.LreachedCount:
> +    mv          t2, a1    /* in case 0 < a1 < t1 */
> +    bnez        a1, 2b    /* if so, still t2 bytes to check, all safe */
> +.LzeroCount:
> +    sub         a0, a0, t4
> +    ret
> +
> +.Lfound:        /* found the 0; subtract buffer start from current pointer */
> +    add         a0, a0, t3 /* and add offset into fetched data */
> +    sub         a0, a0, t4
> +    ret
> +
> +.size   __strnlen, .-__strnlen
> +weak_alias (__strnlen, strnlen)
> +libc_hidden_builtin_def (__strnlen)
> +libc_hidden_builtin_def (strnlen)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strrchr.S b/sysdeps/riscv/rv64/rvv/strrchr.S
> new file mode 100644
> index 0000000000..4bef8a3b9c
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strrchr.S
> @@ -0,0 +1,88 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strrchr
> +.type   strrchr,@function
> +
> +/*
> + *  optimized strrchr for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 2*vlenb
> + */
> +
> +.align    2
> +strrchr:
> +    mv          t5, a0    /* stash buffer ptr somewhere safe */
> +    mv          a0, zero  /* result is nullptr unless we find better below */
> +
> +    csrr        t1, vlenb                /* determine vlenb*2 */
> +    add         t1, t1, t1
> +    addi        t2, t1, -1               /* mask off unaligned part of ptr */
> +    and         t2, t5, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2               /* search to align ptr to 2*vlenb */
> +    vsetvli     t2, t2, e8, m2, ta, mu
> +
> +    vle8.v      v2, (t5)                 /* load data into v2(,v3) */
> +    vmseq.vx    v4, v2, zero             /* check for null terminator */
> +    vfirst.m    t4, v4                   /* grab its position, if any */
> +    vmsbf.m     v0, v4                   /* select valid chars */
> +    vmseq.vx    v0, v2, a1, v0.t         /* search for candidate byte */
> +    vfirst.m    t3, v0                   /* grab its position, if any */
> +    bltz        t3, 2f                   /* did we find a candidate? */
> +
> +3:  add         a0, t3, t5               /* we did! grab the address */
> +    vmsof.m     v1, v0                   /* there might be more than one */
> +    vmandn.mm   v0, v0, v1               /* so clear the one we just found */
> +    vfirst.m    t3, v0                   /* is there another? */
> +    bgez        t3, 3b
> +
> +2:  bgez        t4, .Ldone               /* did we see a null terminator? */
> +    add         t5, t5, t2
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> +
> +1:  vle8.v      v2, (t5)
> +    vmseq.vx    v4, v2, zero
> +    vfirst.m    t4, v4
> +    vmsbf.m     v0, v4
> +    vmseq.vx    v0, v2, a1, v0.t
> +    vfirst.m    t3, v0
> +    bltz        t3, 2f
> +
> +3:  add         a0, t3, t5
> +    vmsof.m     v1, v0
> +    vmandn.mm   v0, v0, v1
> +    vfirst.m    t3, v0
> +    bgez        t3, 3b
> +
> +2:  add         t5, t5, t1
> +    bltz        t4, 1b
> +
> +.Ldone:
> +    ret
> +
> +.size   strrchr, .-strrchr
> +libc_hidden_builtin_def (strrchr)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strspn.S b/sysdeps/riscv/rv64/rvv/strspn.S
> new file mode 100644
> index 0000000000..2b9af5cc2d
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strspn.S
> @@ -0,0 +1,189 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strspn
> +.type   strspn,@function
> +
> +.globl  strcspn
> +.type   strcspn,@function
> +
> +/*
> + *  optimized strspn / strcspn for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 32
> + *  strategy:
> + *  - build a 256-bit table on the stack, where each elt is zero
> + *    if encountering it should terminate computation and nonzero otherwise
> + *  - use vectorised lookups into this to check 2*vlen elts at a time;
> + *    this code is identical for strspn and strcspn and can be shared
> + *
> + *  note that while V mandates at least 128 bit wide regs,
> + *  we are building a 256 bit lookup table
> + *  therefore we use either LMUL=1 or 2 depending on what the target supports
> + *  therefore we only use even vector register numbers,
> + *  so everything still works if we go with LMUL=2
> + */
> +
> +# -----------------------------
> +
> +.align    2
> +
> +strspn:
> +    lbu         t0, 0(a1)
> +    bnez        t0, .Lbuild_table
> +    mv          a0, zero
> +    ret
> +
> +.Lbuild_table:
> +    mv          a6, a0 /* store incoming a0 */
> +    li          t1, 32 /* want to deal with 256 bits at a time, so 32 bytes */
> +
> +    vsetvli     zero, t1, e8, m1, tu, mu
> +#if __riscv_v_min_vlen < 256
> +    /* we want to build a 256-bit table, so use vlenb*2 with m2 if regs
> +     * are 128 bits wide, or vlenb with m1 if they are at least 256 bits;
> +     * 'V' extension specifies a minimum vlen of 128 so this should cover
> +     * all cases; we can skip the check if we know vlen >= 256 at compile time
> +     */
> +    csrr        t2, vlenb
> +    bgeu        t2, t1, 1f
> +    vsetvli     zero, t1, e8, m2, tu, mu
> +1:
> +#endif // __riscv_v_min_vlen
> +
> +    /* read one char from the charset at a time and write the correct bit
> +     * in the lookup table; we could do SIMD iff we ever get an extension
> +     * that provides some way of scattering bytes into a reg group
> +     */
> +    vmv.v.x     v16, zero       /* clear out table */
> +    vmv.v.x     v8, zero        /* clear out v8 */
> +    li          t3, 1
> +    vmv.s.x     v8, t3          /* v8 now all zeroes except bottom byte */
> +
> +1:  vmv.v.x     v2, zero        /* clear out v2 */
> +    addi        a1, a1, 1       /* advance charset ptr */
> +    srli        t2, t0, 3       /* divide the byte we read earlier by 8 */
> +    vslideup.vx v2, v8, t2      /* v2 now 1 in the correct byte 0 elsewhere */
> +    vsll.vx     v2, v2, t0      /* v2 now 1 in the correct bit, 0 elsewhere */
> +    vor.vv      v16, v16, v2    /* or it in */
> +    lbu         t0, 0(a1)       /* fetch next byte */
> +    bnez        t0, 1b          /* go round again unless it's null */
> +
> +/*
> + *   Table is now built in v16.
> + *   Strategy:
> + *   - fetch next t1 bytes from memory
> + *   - vrgather on their values divided by 8 to get relevant bytes of table
> + *   - shift right to get the correct bit into bit 1
> + *   - and with 1, compare with expected terminator value, then check mask
> + *     to see if we've found a terminator
> + *
> + *   Before we can begin, a0 needs to be t1-aligned, so that when we fetch
> + *   the next t1 bytes - any of which may be the null terminator -
> + *   we do not cross a page boundary and read unmapped memory. Therefore
> + *   we have one read of however many bytes are needed to align a0,
> + *   before the main loop.
> + */
> +
> +.Lscan_table:
> +    vmv.v.x     v8, t3              /* v8 now t1 bytes of 0x01 */
> +
> +    and         t2, a0, t1          /* mask to align to t1 */
> +    beqz        t2, 2f              /* or skip if we're already aligned */
> +    sub         t2, t1, t2          /* t2 now bytes to read to align to t1 */
> +
> +    vid.v       v2                  /* build mask instead of changing vl */
> +    vmsltu.vx   v0, v2, t2          /* so we don't need to track LMUL */
> +
> +    vle8.v      v2, (a0), v0.t      /* load next bytes from input */
> +    vsrl.vi     v4, v2, 3           /* divide by 8 */
> +    vrgather.vv v6, v16, v4         /* corresponding bytes of bit table */
> +    vsrl.vv     v6, v6, v2          /* shift correct bits to lsb */
> +    vand.vv     v6, v6, v8          /* and with 1 to complete the lookups */
> +    vmseq.vx    v4, v6, zero, v0.t  /* check to see if any 0s are present */
> +    vfirst.m    t0, v4              /* index of the first 0, if any */
> +    bgez        t0, .Lscan_end      /* if we found one, stop */
> +    add         a0, a0, t2          /* advance by number of bytes we read */
> +
> +2:  add         a6, a6, t1     /* we'll advance a0 before the exit check */
> +1:  vle8.v      v2, (a0)       /* as above but unmasked so t1 elts per pass */
> +    add         a0, a0, t1
> +
> +    vsrl.vi     v4, v2, 3
> +    vrgather.vv v6, v16, v4
> +    vsrl.vv     v6, v6, v2
> +    vand.vv     v6, v6, v8
> +
> +    vmseq.vx    v4, v6, zero
> +    vfirst.m    t0, v4
> +    bltz        t0, 1b
> +
> +.Lscan_end:
> +    add         a0, a0, t0     /* calculate offset to terminating byte */
> +    sub         a0, a0, a6
> +    ret
> +.size   strspn, .-strspn
> +
> +/* strcspn
> + *
> + * table build exactly as for strspn, except:
> + * - the lookup table starts with all bits except bit 0 of byte 0 set
> + * - we clear the corresponding bit for each byte in the charset
> + * once table is built, we can reuse the scan code directly
> + */
> +
> +strcspn:
> +    lbu         t0, 0(a1)
> +    beqz        t0, strlen   /* no rejections -> prefix is whole string */
> +
> +    mv          a6, a0
> +    li          t1, 32
> +
> +    vsetvli     zero, t1, e8, m1, tu, mu
> +#if __riscv_v_min_vlen < 256
> +    csrr        t2, vlenb
> +    bgeu        t2, t1, 1f
> +    vsetvli     zero, t1, e8, m2, tu, mu
> +1:
> +#endif // __riscv_v_min_vlen
> +
> +    vmv.v.x     v8, zero
> +    li          t3, 1           /* all bits clear except bit 0 of byte 0 */
> +    vmv.s.x     v8, t3
> +    vnot.v      v16, v8         /* v16 is the inverse of that */
> +    li          t4, -1
> +
> +1:  vmv.v.x     v2, zero
> +    addi        a1, a1, 1       /* advance charset ptr */
> +    srli        t2, t0, 3       /* select correct bit in v2 */
> +    vslideup.vx v2, v8, t2
> +    vsll.vx     v2, v2, t0
> +    vnot.v      v2, v2          /* invert */
> +    vand.vv     v16, v16, v2    /* clear the relevant bit of table */
> +    lbu         t0, 0(a1)
> +    bnez        t0, 1b
> +    j           .Lscan_table
> +.size   strcspn, .-strcspn
> +
> +libc_hidden_builtin_def (strspn)
> +libc_hidden_builtin_def (strcspn)
> \ No newline at end of file
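
A scalar C model of the 256-bit table and the per-byte lookup the scan
performs (sketch only, with made-up helper names; the vector code does the
same for 32 input bytes at a time, and strcspn starts from the complemented
table with the bit for NUL left clear):

    #include <stddef.h>

    /* Bit (c & 7) of byte (c >> 3) is set for every character in CHARSET. */
    static void
    build_table (unsigned char table[32], const char *charset)
    {
      for (size_t i = 0; i < 32; i++)
        table[i] = 0;
      for (const unsigned char *p = (const unsigned char *) charset; *p; p++)
        table[*p >> 3] |= (unsigned char) (1 << (*p & 7));
    }

    /* The scan stops at the first input byte whose bit is clear.  */
    static int
    accepted (const unsigned char table[32], unsigned char c)
    {
      return (table[c >> 3] >> (c & 7)) & 1;
    }
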
> --
> 2.34.1
>
  
Noah Goldstein Feb. 1, 2023, 6:13 p.m. UTC | #7
On Wed, Feb 1, 2023 at 11:38 AM Adhemerval Zanella Netto via
Libc-alpha <libc-alpha@sourceware.org> wrote:
>
>
>
> On 01/02/23 06:52, Sergei Lewis wrote:
> > Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
> > strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
> > targeting the riscv "V" extension, version 1.0
> >
> > The vectorised implementations assume VLENB of at least 128 and at least 32
> > registers (as mandated by the "V" extension spec). They also assume that
> > VLENB is a power of two which is no larger than the page size, and (as
> > vectorised code in glibc for other platforms does) that it is safe to read
> > past null terminators / buffer ends provided one does not cross a page
> > boundary.
> >
> > Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
>
> Some comments that might be useful since I am working on the generic implementations
> below.
>
> Also, I think it should be split with one implementation per patch, unless the
> implementations are tied together (as for strchr/strchrnul, for instance).  Do
> the vectorized routines only work for rv64?
>
> > ---
> >  sysdeps/riscv/rv64/rvv/Implies     |   2 +
> >  sysdeps/riscv/rv64/rvv/memchr.S    | 127 +++++++++++++++++++
> >  sysdeps/riscv/rv64/rvv/memcmp.S    |  93 ++++++++++++++
> >  sysdeps/riscv/rv64/rvv/memcpy.S    | 154 +++++++++++++++++++++++
> >  sysdeps/riscv/rv64/rvv/memmove.c   |  22 ++++
> >  sysdeps/riscv/rv64/rvv/memset.S    |  89 ++++++++++++++
> >  sysdeps/riscv/rv64/rvv/strchr.S    |  92 ++++++++++++++
> >  sysdeps/riscv/rv64/rvv/strchrnul.c |  22 ++++
> >  sysdeps/riscv/rv64/rvv/strcmp.S    | 108 +++++++++++++++++
> >  sysdeps/riscv/rv64/rvv/strcpy.S    |  72 +++++++++++
> >  sysdeps/riscv/rv64/rvv/strcspn.c   |  22 ++++
> >  sysdeps/riscv/rv64/rvv/strlen.S    |  67 ++++++++++
> >  sysdeps/riscv/rv64/rvv/strncmp.S   | 104 ++++++++++++++++
> >  sysdeps/riscv/rv64/rvv/strncpy.S   |  96 +++++++++++++++
> >  sysdeps/riscv/rv64/rvv/strnlen.S   |  81 +++++++++++++
> >  sysdeps/riscv/rv64/rvv/strrchr.S   |  88 ++++++++++++++
> >  sysdeps/riscv/rv64/rvv/strspn.S    | 189 +++++++++++++++++++++++++++++
> >  17 files changed, 1428 insertions(+)
> >  create mode 100644 sysdeps/riscv/rv64/rvv/Implies
> >  create mode 100644 sysdeps/riscv/rv64/rvv/memchr.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/memcmp.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/memcpy.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/memmove.c
> >  create mode 100644 sysdeps/riscv/rv64/rvv/memset.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strchr.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strchrnul.c
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strcmp.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strcpy.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strcspn.c
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strlen.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strncmp.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strncpy.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strnlen.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strrchr.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strspn.S
> >
> > diff --git a/sysdeps/riscv/rv64/rvv/Implies b/sysdeps/riscv/rv64/rvv/Implies
> > new file mode 100644
> > index 0000000000..b07b4cb906
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/Implies
> > @@ -0,0 +1,2 @@
> > +riscv/rv64/rvd
> > +
> > diff --git a/sysdeps/riscv/rv64/rvv/memchr.S b/sysdeps/riscv/rv64/rvv/memchr.S
> > new file mode 100644
> > index 0000000000..a7e32b8f25
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/memchr.S
> > @@ -0,0 +1,127 @@
> > +
>
> Spurious newline at the start.  We also require a brief comment describing
> the file contents for new files.
>
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
>
> Not sure the 2012 start of the copyright range fits here.
>
> > +
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +
> > +/* Optimised memchr for riscv with vector extension
> > + * Assumptions:
> > + *    - cpu becomes bandwidth limited at or before
> > + *            2 vector register sized read/write operations
> > + *          + 2 scalar operations
> > + *          + conditional branch
> > + */
> > +
> > +.globl  memchr
> > +.type   memchr,@function
> > +
> > +.align    2
> > +memchr:
>
> We have the ENTRY macro for that.
>
> > +    beqz a2, .Lnot_found
>
> Maybe use the L macro here for local labels.
>
> > +    csrr    t1, vlenb
> > +    bgeu    a2, t1, .Lvector_path   /* only use vector path if we're scanning
> > +                                       at least vlenb bytes */
> > +
> > +#ifndef __riscv_strict_align
>
> Would this be defined by the compiler as a predefined macro, or is it just a
> debug switch?  If the latter, I think it would be better to remove it.
>
> > +    li a3, 8
> > +    blt a2, a3, .Lbytewise
> > +
> > +    li      t1, 0x0101010101010101
> > +    slli    a4, t1, 7     /* a4 = 0x8080808080808080 */
> > +    mul     t2, a1, t1    /* entirety of t2 is now repeats of target character;
> > +                             assume mul is at worst no worse than 3*(shift+OR),
> > +                             otherwise do that instead */
> > +
> > +/*
> > + * strategy:
> > + * t4 = ((*a0) ^ t2)
> > + *      - now t4 contains zero bytes if and only if next word of memory
> > + *        had target character at those positions
> > + *
> > + * t4 = ((t4-0x0101010101010101) & ~t4) & 0x8080808080808080
> > + *      - all nonzero bytes of t4 become 0; zero bytes become 0x80
> > + *
> > + * if t4 is nonzero, find the index of the byte within it, add to a0 and return
> > + * otherwise, loop
> > + */
> > +
> > +1:
> > +    ld t4, (a0)          /* t4 = load next 8 bytes */
> > +    xor t4, t4, t2
> > +    sub t5, t4, t1
> > +    not t4, t4
> > +    and t4, t5, t4
> > +    and t4, t4, a4
> > +    bnez t4, .Lbytewise  /* could use ctzw, mod+lookup or just binary chop
> > +                            to locate byte of interest in t4 but profiling
> > +                            shows these approaches are at best no better */
> > +    addi a2, a2, -8
> > +    addi a0, a0, 8
> > +    bgeu a2, a3, 1b
> > +    beqz a2, .Lnot_found
> > +#endif // __riscv_strict_align
> > +
> > +/* too little data for a dword. mask calculation and branch mispredict costs
> > +   make checking a word not worthwhile. degrade to bytewise search. */
> > +
> > +.Lbytewise:
> > +    add t2, a0, a2
> > +
> > +1:
> > +    lb t1, (a0)
> > +    beq t1, a1, .Lfound
> > +    addi a0, a0, 1
> > +    blt a0, t2, 1b
> > +
> > +.Lnot_found:
> > +    mv a0, zero
> > +.Lfound:
> > +    ret
> > +
> > +.Lvector_path:
> > +    vsetvli t2, a2, e8, m2, ta, ma
> > +
> > +1:
> > +    vle8.v      v2, (a0)
> > +    vmseq.vx    v0, v2, a1
> > +    vfirst.m    t3, v0
> > +    bgez        t3, .Lvec_found
> > +    add         a0, a0, t2
> > +    sub         a2, a2, t2
> > +    bge         a2, t2, 1b
> > +    bnez        a2, 2f
> > +    mv          a0, zero
> > +    ret
> > +
> > +2:
> > +    vsetvli t2, a2, e8, m2, ta, ma
> > +    vle8.v      v2, (a0)
> > +    vmseq.vx    v0, v2, a1
> > +    vfirst.m    t3, v0
> > +    bgez        t3, .Lvec_found
> > +    mv          a0, zero
> > +    ret
> > +
> > +.Lvec_found:
> > +    add a0, a0, t3
> > +    ret
> > +
> > +.size   memchr, .-memchr
> > +libc_hidden_builtin_def (memchr)
> > \ No newline at end of file
>
> Please add a newline.
>
> > diff --git a/sysdeps/riscv/rv64/rvv/strcpy.S b/sysdeps/riscv/rv64/rvv/strcpy.S
> > new file mode 100644
> > index 0000000000..b21909d66f
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strcpy.S
> > @@ -0,0 +1,72 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
>
> You can add an optimized stpcpy and use it to implement strcpy on top of that
> (as my generic proposal does [1]).  ARMv6 does something similar [2].
>
> [1] https://patchwork.sourceware.org/project/glibc/patch/20230201170406.303978-12-adhemerval.zanella@linaro.org/
> [2] https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/arm/armv6/strcpy.S;h=e9f63a56c1c605a21b05f7ac21412585b0705171;hb=HEAD
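To make that concrete, a minimal sketch of the layering being suggested (the
glibc aliasing/hidden-def glue and the optimized stpcpy body itself are
omitted, and the function name here is only illustrative):

  /* Sketch only: strcpy built on top of an optimized stpcpy, in the spirit
     of [1].  stpcpy returns a pointer to the NUL it wrote, so strcpy just
     discards that and returns dest.  */
  #include <string.h>

  char *
  strcpy_via_stpcpy (char *dest, const char *src)
  {
    stpcpy (dest, src);
    return dest;
  }

That way only stpcpy needs the vector loop, and strcpy comes along for free.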
>
> > +#include <sysdep.h>
> > +
> > +.globl  strcpy
> > +.type   strcpy,@function
> > +
> > +/*
> > + *  optimized strcpy for riscv with vector extension
> > + *  assumptions:
> > + *  - vlenb is a power of 2
> > + *  - page size >= 2*vlenb
> > + */
> > +
> > +.align    2
> > +strcpy:
> > +    mv          t0, a0                     /* copy dest so we can return it */
> > +
> > +    csrr        t1, vlenb                  /* find vlenb*2 */
> > +    add         t1, t1, t1
> > +
> > +    addi        t2, t1, -1                 /* mask unaligned part of ptr */
> > +    and         t2, a1, t2
> > +    beqz        t2, .Laligned
> > +
> > +    sub         t2, t1, t2                 /* search enough to align ptr */
> > +    vsetvli     t2, t2, e8, m2, tu, mu
> > +    vle8.v      v2, (a1)
> > +    vmseq.vx    v4, v2, zero
> > +    vmsif.m     v0, v4                     /* copy but not past null */
> > +    vfirst.m    t3, v4
> > +    vse8.v      v2, (t0), v0.t
> > +    bgez        t3, .Ldone
> > +    add         t0, t0, t2
> > +    add         a1, a1, t2
> > +
> > +.Laligned:
> > +    vsetvli     zero, t1, e8, m2, ta, mu   /* now do 2*vlenb bytes per pass */
> > +
> > +1:
> > +    vle8.v      v2, (a1)
> > +    add         a1, a1, t1
> > +    vmseq.vx    v4, v2, zero
> > +    vmsif.m     v0, v4
> > +    vfirst.m    t3, v4
> > +    vse8.v      v2, (t0), v0.t
> > +    add         t0, t0, t1
> > +    bltz        t3, 1b
> > +
> > +.Ldone:
> > +    ret
> > +
> > +.size   strcpy, .-strcpy
> > +libc_hidden_builtin_def (strcpy)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strcspn.c b/sysdeps/riscv/rv64/rvv/strcspn.c
> > new file mode 100644
> > index 0000000000..f0595a72fb
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strcspn.c
> > @@ -0,0 +1,22 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +/* strcspn is implemented in strspn.S
> > + */
> > diff --git a/sysdeps/riscv/rv64/rvv/strlen.S b/sysdeps/riscv/rv64/rvv/strlen.S
> > new file mode 100644
> > index 0000000000..c77d500693
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strlen.S
> > @@ -0,0 +1,67 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +.globl  strlen
> > +.type   strlen,@function
> > +
> > +/*
> > + *  optimized strlen for riscv with vector extension
> > + *  assumptions:
> > + *  - vlenb is a power of 2
> > + *  - page size >= 2*vlenb
> > + */
> > +
> > +.align    2
> > +strlen:
> > +    mv          t4, a0                    /* copy of buffer start */
> > +    csrr        t1, vlenb                 /* find vlenb*2 */
> > +    add         t1, t1, t1
> > +    addi        t2, t1, -1                /* mask off unaligned part of ptr */
> > +    and         t2, a0, t2
> > +    beqz        t2, .Laligned
> > +
> > +    sub         t2, t1, t2                /* search fwd to align ptr */
> > +    vsetvli     t2, t2, e8, m2, ta, ma
> > +    vle8.v      v2, (a0)
> > +    vmseq.vx    v0, v2, zero
> > +    vfirst.m    t3, v0
> > +    bgez        t3, .Lfound
> > +    add         a0, a0, t2
> > +
> > +.Laligned:
> > +    vsetvli     zero, t1, e8, m2, ta, ma  /* search 2*vlenb bytes per pass */
> > +    add         t4, t4, t1
> > +
> > +1:
> > +    vle8.v      v2, (a0)
> > +    add         a0, a0, t1
> > +    vmseq.vx    v0, v2, zero
> > +    vfirst.m    t3, v0
> > +    bltz        t3, 1b
> > +
> > +.Lfound:                                  /* found the 0; subtract          */
> > +    sub         a0, a0, t4                /* buffer start from current ptr  */
> > +    add         a0, a0, t3                /* and add offset into fetched    */
> > +    ret                                   /* data to get length */
> > +
> > +.size   strlen, .-strlen
> > +libc_hidden_builtin_def (strlen)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strncmp.S b/sysdeps/riscv/rv64/rvv/strncmp.S
> > new file mode 100644
> > index 0000000000..863e5cb525
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strncmp.S
> > @@ -0,0 +1,104 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +.globl  strncmp
> > +.type   strncmp,@function
> > +
> > +.align    2
> > +
> > +/* as strcmp, but with added checks on a2 (max count)
> > + */
> > +
> > +strncmp:
> > +    csrr        t1, vlenb                   /* find vlenb*2 */
> > +    add         t1, t1, t1
> > +    blt         a2, t1, .Ltail              /* degrade if max < vlenb*2 */
> > +    vsetvli     zero, t1, e8, m2, ta, mu
> > +    vid.v       v30
> > +    addi        t2, t1, -1                  /* mask unaligned part of ptr */
> > +    and         t6, a0, t2                  /* unaligned part of lhs */
> > +    and         t5, a1, t2                  /* unaligned part of rhs */
> > +    sub         t6, t1, t6                  /* safe count to read from lhs */
> > +    sub         t5, t1, t5                  /* same, rhs */
> > +    vmsltu.vx   v28, v30, t6                /* mask for first part of lhs */
> > +    vmsltu.vx   v26, v30, t5                /* mask for first part of rhs */
> > +    vmv.v.x     v16, zero
> > +    vmv.v.x     v18, zero
> > +
> > +
> > +1:  blt         a2, t1, .Ltail
> > +    vmv.v.v     v0, v28                     /* lhs mask */
> > +    vle8.v      v2, (a0), v0.t              /* masked load from lhs */
> > +    vmseq.vx    v16, v2, zero, v0.t         /* check loaded bytes for null */
> > +    vmv.v.v     v0, v26                     /*       rhs mask */
> > +    vfirst.m    t2, v16                     /* get lhs check result */
> > +    bgez        t2, .Ltail                  /* can we safely check rest */
> > +    vle8.v      v4, (a1), v0.t              /*       masked load from rhs */
> > +    vmseq.vx    v18, v4, zero, v0.t         /*       check partial rhs */
> > +    vmnot.m     v0, v28                     /* mask for rest of lhs */
> > +    vfirst.m    t3, v18                     /* get check result */
> > +    bltz        t3, 2f                      /* test it */
> > +    bge         t3, t6, .Ltail
> > +
> > +    vmsleu.vx   v0, v30, t3                 /* select rest of string + null */
> > +    vmsne.vv    v0, v2, v4, v0.t            /* compare */
> > +    vfirst.m    t3, v0
> > +    bgez        t3, 3f
> > +    mv          a0, zero
> > +    ret
> > +3:  add a0, a0, t3
> > +    add a1, a1, t3
> > +    lbu t0, (a0)
> > +    lbu t1, (a1)
> > +.Ldiff:
> > +    sub a0, t0, t1
> > +    ret
> > +
> > +    /* ...no null terminator in first part of lhs or rhs */
> > +2:  vle8.v      v2, (a0), v0.t              /* load rest of lhs */
> > +    vmnot.m     v0, v26                     /* mask for rest of rhs */
> > +    vle8.v      v4, (a1), v0.t              /* load rest of rhs */
> > +    vmsne.vv    v0, v2, v4                  /* compare */
> > +    add         a0, a0, t1                  /* advance ptrs */
> > +    vfirst.m    t3, v0
> > +    add         a1, a1, t1
> > +    sub         a2, a2, t1
> > +    bltz        t3, 1b
> > +
> > +    sub t3, t3, t1  /* found a diff but we've already advanced a0 and a1 */
> > +    j 3b
> > +
> > +.Ltail:
> > +    beqz a2, 1f
> > +    addi a2, a2, -1
> > +    lbu t0, (a0)
> > +    lbu t1, (a1)
> > +    bne t0, t1, .Ldiff
> > +    addi a0, a0, 1
> > +    addi a1, a1, 1
> > +    bnez t0, .Ltail
> > +1:  mv a0, zero
> > +    ret
> > +
> > +
> > +.size strncmp, .-strncmp
> > +libc_hidden_builtin_def (strncmp)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strncpy.S b/sysdeps/riscv/rv64/rvv/strncpy.S
> > new file mode 100644
> > index 0000000000..8b3a1e545c
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strncpy.S
> > @@ -0,0 +1,96 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +.globl  strncpy
> > +.type   strncpy,@function
> > +
> > +/*
> > + *  optimized strncpy for riscv with vector extension
> > + *  assumptions:
> > + *  - vlenb is a power of 2
> > + *  - page size >= 2*vlenb
> > + */
> > +
> > +.align    2
> > +strncpy:
> > +    mv          t0, a0                    /* need to return dest so copy */
> > +
> > +    csrr        t1, vlenb                 /* find vlenb*2 */
> > +    add         t1, t1, t1
> > +
> > +    addi        t2, t1, -1                /* mask off unaligned part of ptr */
> > +    and         t2, a1, t2
> > +    beqz        t2, .Laligned
> > +
> > +    sub         t2, t1, t2                /* search to align the pointer */
> > +    vsetvli     zero, t2, e8, m2, tu, mu
> > +    vle8.v      v2, (a1)
> > +    vmseq.vx    v4, v2, zero
> > +    vmsif.m     v0, v4                    /* copy to dest */
> > +    vfirst.m    t3, v4
> > +    bgeu        t2, a2, .Ldest_full
> > +    vse8.v      v2, (t0), v0.t
> > +    bgez        t3, .Lterminator_found
> > +    add         t0, t0, t2
> > +    add         a1, a1, t2
> > +    sub         a2, a2, t2
> > +    beqz        a2, .Ldone
> > +
> > +.Laligned:
> > +    vsetvli     zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> > +
> > +1:
> > +    vle8.v      v2, (a1)
> > +    add         a1, a1, t1
> > +    vmseq.vx    v4, v2, zero
> > +    vmsif.m     v0, v4
> > +    vfirst.m    t3, v4
> > +    bgeu        t1, a2, .Ldest_full
> > +    vse8.v      v2, (t0), v0.t
> > +    add         t0, t0, t1
> > +    sub         a2, a2, t1
> > +    bltz        t3, 1b
> > +    sub         t0, t0, t1
> > +
> > +.Lterminator_found:
> > +    addi        sp, sp, -16
> > +    sd          ra, 0(sp)
> > +    sd          a0, 8(sp)
> > +    add         a0, t0, t3
> > +    mv          a1, zero
> > +    sub         a2, a2, t3
> > +    jal         ra, memset
> > +    ld          ra, 0(sp)
> > +    ld          a0, 8(sp)
> > +    addi        sp, sp, 16
> > +.Ldone:
> > +    ret
> > +
> > +.Ldest_full:
> > +    vid.v       v6
> > +    vmsltu.vx   v4, v6, a2
> > +    vmand.mm    v0, v0, v4
> > +    vse8.v      v2, (t0), v0.t
> > +    ret
> > +
> > +.size   strncpy, .-strncpy
> > +libc_hidden_builtin_def (strncpy)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strnlen.S b/sysdeps/riscv/rv64/rvv/strnlen.S
> > new file mode 100644
> > index 0000000000..6d7ee65c7a
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strnlen.S
> > @@ -0,0 +1,81 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +#include <sysdep.h>
> > +
>
> Maybe use a generic implementation that calls memchr (which should be optimized
> using vector instructions) [3]?  It would be an extra function call, but it should
> really help with both code size and icache pressure.
>
> [3] https://patchwork.sourceware.org/project/glibc/patch/20230201170406.303978-6-adhemerval.zanella@linaro.org/
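For reference, the generic shape being pointed at is essentially a one-liner
over memchr (a sketch; the __strnlen/strnlen aliasing and hidden defs are
left out):

  /* Sketch: let the vectorized memchr do the scanning.  */
  #include <string.h>

  size_t
  strnlen_via_memchr (const char *s, size_t maxlen)
  {
    const char *p = memchr (s, '\0', maxlen);
    return p == NULL ? maxlen : (size_t) (p - s);
  }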
>
> > +.globl  __strnlen
> > +.type   __strnlen,@function
> > +
> > +/* vector optimized strnlen
> > + * assume it's safe to read to the end of the page
> > + * containing either a null terminator or the last byte of the count or both,
> > + * but not past it
> > + * assume page size >= vlenb*2
> > + */
> > +
> > +.align    2
> > +__strnlen:
> > +    mv          t4, a0               /* stash a copy of start for later */
> > +    beqz        a1, .LzeroCount
> > +
> > +    csrr        t1, vlenb            /* find vlenb*2 */
> > +    add         t1, t1, t1
> > +    addi        t2, t1, -1           /* mask off unaligned part of ptr */
> > +    and         t2, a0, t2
> > +    beqz        t2, .Laligned
> > +
> > +    sub         t2, t1, t2           /* search to align pointer to t1 */
> > +    bgeu        t2, a1, 2f           /* check it's safe */
> > +    mv          t2, a1               /* it's not! look as far as permitted */
> > +2:  vsetvli     t2, t2, e8, m2, ta, ma
> > +    vle8.v      v2, (a0)
> > +    vmseq.vx    v0, v2, zero
> > +    vfirst.m    t3, v0
> > +    bgez        t3, .Lfound
> > +    add         a0, a0, t2
> > +    sub         a1, a1, t2
> > +    bltu        a1, t1, .LreachedCount
> > +
> > +.Laligned:
> > +    vsetvli     zero, t1, e8, m2, ta, ma    /* do 2*vlenb bytes per pass */
> > +
> > +1:  vle8.v      v2, (a0)
> > +    sub         a1, a1, t1
> > +    vmseq.vx    v0, v2, zero
> > +    vfirst.m    t3, v0
> > +    bgez        t3, .Lfound
> > +    add         a0, a0, t1
> > +    bgeu        a1, t1, 1b
> > +.LreachedCount:
> > +    mv          t2, a1    /* in case 0 < a1 < t1 */
> > +    bnez        a1, 2b    /* if so, still t2 bytes to check, all safe */
> > +.LzeroCount:
> > +    sub         a0, a0, t4
> > +    ret
> > +
> > +.Lfound:        /* found the 0; subtract buffer start from current pointer */
> > +    add         a0, a0, t3 /* and add offset into fetched data */
> > +    sub         a0, a0, t4
> > +    ret
> > +
> > +.size   __strnlen, .-__strnlen
> > +weak_alias (__strnlen, strnlen)
> > +libc_hidden_builtin_def (__strnlen)
> > +libc_hidden_builtin_def (strnlen)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strrchr.S b/sysdeps/riscv/rv64/rvv/strrchr.S
> > new file mode 100644
> > index 0000000000..4bef8a3b9c
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strrchr.S
> > @@ -0,0 +1,88 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +#include <sysdep.h>
>
> Is it really worth adding a strrchr optimization?  The generic implementation
> already calls strchr (which should be optimized).
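For context, the generic approach is roughly the loop below (a sketch of the
strchr-based shape, not the exact generic code), so the vector win already
comes from strchr itself:

  /* Sketch: strrchr as repeated strchr calls.  */
  #include <string.h>

  char *
  strrchr_via_strchr (const char *s, int c)
  {
    const char *found = NULL;
    const char *p;

    if ((unsigned char) c == '\0')
      return strchr (s, '\0');

    while ((p = strchr (s, c)) != NULL)
      {
        found = p;
        s = p + 1;
      }
    return (char *) found;
  }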
>
> > +
> > +.globl  strrchr
> > +.type   strrchr,@function
> > +
> > +/*
> > + *  optimized strrchr for riscv with vector extension
> > + *  assumptions:
> > + *  - vlenb is a power of 2
> > + *  - page size >= 2*vlenb
> > + */
> > +
> > +.align    2
> > +strrchr:
> > +    mv          t5, a0    /* stash buffer ptr somewhere safe */
> > +    mv          a0, zero  /* result is nullptr unless we find better below */
> > +
> > +    csrr        t1, vlenb                /* determine vlenb*2 */
> > +    add         t1, t1, t1
> > +    addi        t2, t1, -1               /* mask off unaligned part of ptr */
> > +    and         t2, t5, t2
> > +    beqz        t2, .Laligned
> > +
> > +    sub         t2, t1, t2               /* search to align ptr to 2*vlenb */
> > +    vsetvli     t2, t2, e8, m2, ta, mu
> > +
> > +    vle8.v      v2, (t5)                 /* load data into v2(,v3) */
> > +    vmseq.vx    v4, v2, zero             /* check for null terminator */
> > +    vfirst.m    t4, v4                   /* grab its position, if any */
> > +    vmsbf.m     v0, v4                   /* select valid chars */
> > +    vmseq.vx    v0, v2, a1, v0.t         /* search for candidate byte */
> > +    vfirst.m    t3, v0                   /* grab its position, if any */
> > +    bltz        t3, 2f                   /* did we find a candidate? */
> > +
> > +3:  add         a0, t3, t5               /* we did! grab the address */
> > +    vmsof.m     v1, v0                   /* there might be more than one */
> > +    vmandn.mm   v0, v0, v1               /* so clear the one we just found */
> > +    vfirst.m    t3, v0                   /* is there another? */
> > +    bgez        t3, 3b
> > +
> > +2:  bgez        t4, .Ldone               /* did we see a null terminator? */
> > +    add         t5, t5, t2
> > +
> > +.Laligned:
> > +    vsetvli     zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> > +
> > +1:  vle8.v      v2, (t5)
> > +    vmseq.vx    v4, v2, zero
> > +    vfirst.m    t4, v4
> > +    vmsbf.m     v0, v4
> > +    vmseq.vx    v0, v2, a1, v0.t
> > +    vfirst.m    t3, v0
> > +    bltz        t3, 2f
> > +
> > +3:  add         a0, t3, t5
> > +    vmsof.m     v1, v0
> > +    vmandn.mm   v0, v0, v1
> > +    vfirst.m    t3, v0
> > +    bgez        t3, 3b
> > +
> > +2:  add         t5, t5, t1
> > +    bltz        t4, 1b
> > +
> > +.Ldone:
> > +    ret
> > +
> > +.size   strrchr, .-strrchr
> > +libc_hidden_builtin_def (strrchr)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strspn.S b/sysdeps/riscv/rv64/rvv/strspn.S
> > new file mode 100644
> > index 0000000000..2b9af5cc2d
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strspn.S
> > @@ -0,0 +1,189 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +.globl  strspn
> > +.type   strspn,@function
> > +
> > +.globl  strcspn
> > +.type   strcspn,@function
> > +
> > +/*
> > + *  optimized strspn / strcspn for riscv with vector extension
> > + *  assumptions:
> > + *  - vlenb is a power of 2
> > + *  - page size >= 32
> > + *  strategy:
> > + *  - build a 256-bit table on the stack, where each elt is zero
> > + *    if encountering it should terminate computation and nonzero otherwise
> > + *  - use vectorised lookups into this to check 2*vlen elts at a time;
> > + *    this code is identical for strspn and strcspn and can be shared
> > + *
> > + *  note that while V mandates at least 128 bit wide regs,
> > + *  we are building a 256 bit lookup table
> > + *  therefore we use either LMUL=1 or 2 depending on what the target supports
> > + *  therefore we only use even vector register numbers,
> > + *  so everything still works if we go with LMUL=2
> > + */
> > +
>
> I wonder if we could adapt the generic implementation, so that riscv only
> reimplements the vectorized search instead of all the boilerplate to generate
> the table and early tests.

+1
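Something like the split below is what that would look like: keep the table
build and the early-exit checks in generic C and let the port supply only the
scan loop.  The __strspn_table_scan hook name and its 256-bit bitmap layout
are made up here for illustration; strcspn would build the complemented table
and reuse the same hook:

  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  /* Arch hook (hypothetical): count the leading bytes of S whose bit is set
     in the 256-bit TABLE, e.g. the vrgather-based loop from this patch.  */
  extern size_t __strspn_table_scan (const char *s, const uint8_t table[32]);

  size_t
  strspn_generic_sketch (const char *s, const char *accept)
  {
    uint8_t table[32];
    const unsigned char *a;

    if (accept[0] == '\0')
      return 0;

    memset (table, 0, sizeof table);
    for (a = (const unsigned char *) accept; *a != '\0'; a++)
      table[*a >> 3] |= (uint8_t) (1u << (*a & 7));

    return __strspn_table_scan (s, table);
  }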
>
> > +# -----------------------------
> > +
> > +.align    2
> > +
> > +strspn:
> > +    lbu         t0, 0(a1)
> > +    bnez        t0, .Lbuild_table
> > +    mv          a0, zero
> > +    ret
> > +
> > +.Lbuild_table:
> > +    mv          a6, a0 /* store incoming a0 */
> > +    li          t1, 32 /* want to deal with 256 bits at a time, so 32 bytes */
> > +
> > +    vsetvli     zero, t1, e8, m1, tu, mu
> > +#if __riscv_v_min_vlen < 256
> > +    /* we want to build a 256-bit table, so use vlenb*2,
> > +     * m2 if regs are 128 bits wide or vlenb, m1 if >= 256
> > +     * 'V' extension specifies a minimum vlen of 128 so this should cover
> > +     * all cases; we can skip the check if we know vlen >= 256 at compile time
> > +     */
> > +    csrr        t2, vlenb
> > +    bgeu        t2, t1, 1f
> > +    vsetvli     zero, t1, e8, m2, tu, mu
> > +1:
> > +#endif // __riscv_v_min_vlen
> > +
> > +    /* read one char from the charset at a time and write the correct bit
> > +     * in the lookup table; we could do SIMD iff we ever get an extension
> > +     * that provides some way of scattering bytes into a reg group
> > +     */
> > +    vmv.v.x     v16, zero       /* clear out table */
> > +    vmv.v.x     v8, zero        /* clear out v8 */
> > +    li          t3, 1
> > +    vmv.s.x     v8, t3          /* v8 now all zeroes except bottom byte */
> > +
> > +1:  vmv.v.x     v2, zero        /* clear out v2 */
> > +    addi        a1, a1, 1       /* advance charset ptr */
> > +    srli        t2, t0, 3       /* divide the byte we read earlier by 8 */
> > +    vslideup.vx v2, v8, t2      /* v2 now 1 in the correct byte 0 elsewhere */
> > +    vsll.vx     v2, v2, t0      /* v2 now 1 in the correct bit, 0 elsewhere */
> > +    vor.vv      v16, v16, v2    /* or it in */
> > +    lbu         t0, 0(a1)       /* fetch next byte */
> > +    bnez        t0, 1b          /* if it's not null, go round again */
> > +
> > +/*
> > + *   Table is now built in v16.
> > + *   Strategy:
> > + *   - fetch next t1 bytes from memory
> > + *   - vrgather on their values divided by 8 to get relevant bytes of table
> > + *   - shift right to get the correct bit into bit 1
> > + *   - and with 1, compare with expected terminator value, then check mask
> > + *     to see if we've found a terminator
> > + *
> > + *   Before we can begin, a0 needs to be t1-aligned, so that when we fetch
> > + *   the next t1 bytes - any of which may be the null terminator -
> > + *   we do not cross a page boundary and read unmapped memory. Therefore
> > + *   we have one read of however many bytes are needed to align a0,
> > + *   before the main loop.
> > + */
> > +
> > +.Lscan_table:
> > +    vmv.v.x     v8, t3              /* v8 now t1 bytes of 0x01 */
> > +
> > +    and         t2, a0, t1          /* mask to align to t1 */
> > +    beqz        t2, 2f              /* or skip if we're already aligned */
> > +    sub         t2, t1, t2          /* t2 now bytes to read to align to t1 */
> > +
> > +    vid.v       v2                  /* build mask instead of changing vl */
> > +    vmsltu.vx   v0, v2, t2          /* so we don't need to track LMUL */
> > +
> > +    vle8.v      v2, (a0), v0.t      /* load next bytes from input */
> > +    vsrl.vi     v4, v2, 3           /* divide by 8 */
> > +    vrgather.vv v6, v16, v4         /* corresponding bytes of bit table */
> > +    vsrl.vv     v6, v6, v2          /* shift correct bits to lsb */
> > +    vand.vv     v6, v6, v8          /* and with 1 to complete the lookups */
> > +    vmseq.vx    v4, v6, zero, v0.t  /* check to see if any 0s are present */
> > +    vfirst.m    t0, v4              /* index of the first 0, if any */
> > +    bgez        t0, .Lscan_end      /* if we found one, stop */
> > +    add         a0, a0, t2          /* advance by number of bytes we read */
> > +
> > +2:  add         a6, a6, t1     /* we'll advance a0 before the exit check */
> > +1:  vle8.v      v2, (a0)       /* as above but unmasked so t1 elts per pass */
> > +    add         a0, a0, t1
> > +
> > +    vsrl.vi     v4, v2, 3
> > +    vrgather.vv v6, v16, v4
> > +    vsrl.vv     v6, v6, v2
> > +    vand.vv     v6, v6, v8
> > +
> > +    vmseq.vx    v4, v6, zero
> > +    vfirst.m    t0, v4
> > +    bltz        t0, 1b
> > +
> > +.Lscan_end:
> > +    add         a0, a0, t0     /* calculate offset to terminating byte */
> > +    sub         a0, a0, a6
> > +    ret
> > +.size   strspn, .-strspn
> > +
> > +/* strcspn
> > + *
> > + * table build exactly as for strspn, except:
> > + * - the lookup table starts with all bits except bit 0 of byte 0 set
> > + * - we clear the corresponding bit for each byte in the charset
> > + * once table is built, we can reuse the scan code directly
> > + */
> > +
> > +strcspn:
> > +    lbu         t0, 0(a1)
> > +    beqz        t0, strlen   /* no rejections -> prefix is whole string */
> > +
> > +    mv          a6, a0
> > +    li          t1, 32
> > +
> > +    vsetvli     zero, t1, e8, m1, tu, mu
> > +#if __riscv_v_min_vlen < 256
> > +    csrr        t2, vlenb
> > +    bgeu        t2, t1, 1f
> > +    vsetvli     zero, t1, e8, m2, tu, mu
> > +1:
> > +#endif // __riscv_v_min_vlen
> > +
> > +    vmv.v.x     v8, zero
> > +    li          t3, 1           /* all bits clear except bit 0 of byte 0 */
> > +    vmv.s.x     v8, t3
> > +    vnot.v      v16, v8         /* v16 is the inverse of that */
> > +    li          t4, -1
> > +
> > +1:  vmv.v.x     v2, zero
> > +    addi        a1, a1, 1       /* advance charset ptr */
> > +    srli        t2, t0, 3       /* select correct bit in v2 */
> > +    vslideup.vx v2, v8, t2
> > +    vsll.vx     v2, v2, t0
> > +    vnot.v      v2, v2          /* invert */
> > +    vand.vv     v16, v16, v2    /* clear the relevant bit of table */
> > +    lbu         t0, 0(a1)
> > +    bnez        t0, 1b
> > +    j           .Lscan_table
> > +.size   strcspn, .-strcspn
> > +
> > +libc_hidden_builtin_def (strspn)
> > +libc_hidden_builtin_def (strcspn)
> > \ No newline at end of file
  
Andrew Waterman Feb. 1, 2023, 6:13 p.m. UTC | #8
On Wed, Feb 1, 2023 at 1:54 AM Sergei Lewis <slewis@rivosinc.com> wrote:
>
> Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
> strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
> targeting the riscv "V" extension, version 1.0
>
> The vectorised implementations assume VLENB of at least 128 and at least 32
> registers (as mandated by the "V" extension spec). They also assume that
> VLENB is a power of two which is no larger than the page size

Standard software shouldn't be arbitrarily imposing upper bounds on
VLEN.  (The assumption that VLEN >= 128 is valid, because that's
mandated by the V extension specification.)

There are already RISC-V vector supercomputer proposals that rub up
against this limit, or maybe even exceed it.  glibc shouldn't be the
place where we decide such implementations are nonconforming
(especially if the outcome is to unexpectedly fail at runtime).

The intended mechanism for vectorizing string routines is fault-only-first
loads.  These also result in much simpler code.  See
the sample code for strlen, for example, which doesn't need to make
assumptions about VLEN or about the page size.  There are other string
examples in the same directory.
https://github.com/riscv/riscv-v-spec/blob/6673ce8b1df3126cf250b8cbf422329f257adf08/example/strlen.s

Please do not merge this patch set as-is.

> , and (as
> vectorised code in glibc for other platforms does) that it is safe to read
> past null terminators / buffer ends provided one does not cross a page
> boundary.
>
> Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
> ---
>  sysdeps/riscv/rv64/rvv/Implies     |   2 +
>  sysdeps/riscv/rv64/rvv/memchr.S    | 127 +++++++++++++++++++
>  sysdeps/riscv/rv64/rvv/memcmp.S    |  93 ++++++++++++++
>  sysdeps/riscv/rv64/rvv/memcpy.S    | 154 +++++++++++++++++++++++
>  sysdeps/riscv/rv64/rvv/memmove.c   |  22 ++++
>  sysdeps/riscv/rv64/rvv/memset.S    |  89 ++++++++++++++
>  sysdeps/riscv/rv64/rvv/strchr.S    |  92 ++++++++++++++
>  sysdeps/riscv/rv64/rvv/strchrnul.c |  22 ++++
>  sysdeps/riscv/rv64/rvv/strcmp.S    | 108 +++++++++++++++++
>  sysdeps/riscv/rv64/rvv/strcpy.S    |  72 +++++++++++
>  sysdeps/riscv/rv64/rvv/strcspn.c   |  22 ++++
>  sysdeps/riscv/rv64/rvv/strlen.S    |  67 ++++++++++
>  sysdeps/riscv/rv64/rvv/strncmp.S   | 104 ++++++++++++++++
>  sysdeps/riscv/rv64/rvv/strncpy.S   |  96 +++++++++++++++
>  sysdeps/riscv/rv64/rvv/strnlen.S   |  81 +++++++++++++
>  sysdeps/riscv/rv64/rvv/strrchr.S   |  88 ++++++++++++++
>  sysdeps/riscv/rv64/rvv/strspn.S    | 189 +++++++++++++++++++++++++++++
>  17 files changed, 1428 insertions(+)
>  create mode 100644 sysdeps/riscv/rv64/rvv/Implies
>  create mode 100644 sysdeps/riscv/rv64/rvv/memchr.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/memcmp.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/memcpy.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/memmove.c
>  create mode 100644 sysdeps/riscv/rv64/rvv/memset.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strchr.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strchrnul.c
>  create mode 100644 sysdeps/riscv/rv64/rvv/strcmp.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strcpy.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strcspn.c
>  create mode 100644 sysdeps/riscv/rv64/rvv/strlen.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strncmp.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strncpy.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strnlen.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strrchr.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strspn.S
>
> diff --git a/sysdeps/riscv/rv64/rvv/Implies b/sysdeps/riscv/rv64/rvv/Implies
> new file mode 100644
> index 0000000000..b07b4cb906
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/Implies
> @@ -0,0 +1,2 @@
> +riscv/rv64/rvd
> +
> diff --git a/sysdeps/riscv/rv64/rvv/memchr.S b/sysdeps/riscv/rv64/rvv/memchr.S
> new file mode 100644
> index 0000000000..a7e32b8f25
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memchr.S
> @@ -0,0 +1,127 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +
> +/* Optimised memchr for riscv with vector extension
> + * Assumptions:
> + *    - cpu becomes bandwidth limited at or before
> + *            2 vector register sized read/write operations
> + *          + 2 scalar operations
> + *          + conditional branch
> + */
> +
> +.globl  memchr
> +.type   memchr,@function
> +
> +.align    2
> +memchr:
> +    beqz a2, .Lnot_found
> +    csrr    t1, vlenb
> +    bgeu    a2, t1, .Lvector_path   /* only use vector path if we're scanning
> +                                       at least vlenb bytes */
> +
> +#ifndef __riscv_strict_align
> +    li a3, 8
> +    blt a2, a3, .Lbytewise
> +
> +    li      t1, 0x0101010101010101
> +    slli    a4, t1, 7     /* a4 = 0x8080808080808080 */
> +    mul     t2, a1, t1    /* entirety of t2 is now repeats of target character;
> +                             assume mul is at worst no worse than 3*(shift+OR),
> +                             otherwise do that instead */
> +
> +/*
> + * strategy:
> + * t4 = ((*a0) ^ t2)
> + *      - now t4 contains zero bytes if and only if next word of memory
> + *        had target character at those positions
> + *
> + * t4 = ((t4-0x0101010101010101) & ~t4) & 0x8080808080808080
> + *      - all nonzero bytes of t4 become 0; zero bytes become 0x80
> + *
> + * if t4 is nonzero, find the index of the byte within it, add to a0 and return
> + * otherwise, loop
> + */
> +
> +1:
> +    ld t4, (a0)          /* t4 = load next 8 bytes */
> +    xor t4, t4, t2
> +    sub t5, t4, t1
> +    not t4, t4
> +    and t4, t5, t4
> +    and t4, t4, a4
> +    bnez t4, .Lbytewise  /* could use ctzw, mod+lookup or just binary chop
> +                            to locate byte of interest in t4 but profiling
> +                            shows these approaches are at best no better */
> +    addi a2, a2, -8
> +    addi a0, a0, 8
> +    bgeu a2, a3, 1b
> +    beqz a2, .Lnot_found
> +#endif // __riscv_strict_align
> +
> +/* too little data for a dword. mask calculation and branch mispredict costs
> +   make checking a word not worthwhile. degrade to bytewise search. */
> +
> +.Lbytewise:
> +    add t2, a0, a2
> +
> +1:
> +    lb t1, (a0)
> +    beq t1, a1, .Lfound
> +    addi a0, a0, 1
> +    blt a0, t2, 1b
> +
> +.Lnot_found:
> +    mv a0, zero
> +.Lfound:
> +    ret
> +
> +.Lvector_path:
> +    vsetvli t2, a2, e8, m2, ta, ma
> +
> +1:
> +    vle8.v      v2, (a0)
> +    vmseq.vx    v0, v2, a1
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lvec_found
> +    add         a0, a0, t2
> +    sub         a2, a2, t2
> +    bge         a2, t2, 1b
> +    bnez        a2, 2f
> +    mv          a0, zero
> +    ret
> +
> +2:
> +    vsetvli t2, a2, e8, m2, ta, ma
> +    vle8.v      v2, (a0)
> +    vmseq.vx    v0, v2, a1
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lvec_found
> +    mv          a0, zero
> +    ret
> +
> +.Lvec_found:
> +    add a0, a0, t3
> +    ret
> +
> +.size   memchr, .-memchr
> +libc_hidden_builtin_def (memchr)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/memcmp.S b/sysdeps/riscv/rv64/rvv/memcmp.S
> new file mode 100644
> index 0000000000..a945753a5f
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memcmp.S
> @@ -0,0 +1,93 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +/* Optimised memcmp for riscv with vector extension
> + */
> +
> +.globl  memcmp
> +.type   memcmp,@function
> +
> +.align    2
> +
> +memcmp:
> +    mv t2, zero
> +    beqz a2, .Ldone
> +
> +    li          t1, 5            /* scalar path cheaper for 1-4 elts */
> +    bltu        a2, t1, .Lscalar
> +
> +    /* main loop, vlenb*2 elts at a time */
> +    vsetvli     t1, a2, e8, m2, ta, ma
> +
> +1:
> +    vle8.v      v2, (a0)        /* load elts */
> +    vle8.v      v4, (a1)
> +    vmsne.vv    v0, v2, v4      /* compare */
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lvec_diff  /* found a difference ? */
> +    add         a0, a0, t1      /* not yet, advance everything */
> +    add         a1, a1, t1
> +    sub         a2, a2, t1
> +    bgeu        a2, t1, 1b
> +
> +    bnez        a2, .Ltail
> +    mv          a0, zero
> +    ret
> +
> +.Ltail:
> +    /* handle tail. we know a2 < vlenb*2 so just load and compare the lot */
> +    vsetvli     t1, a2, e8, m2, ta, ma
> +    vle8.v      v2, (a0)
> +    vle8.v      v4, (a1)
> +    vmsne.vv    v0, v2, v4
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lvec_diff
> +    mv          a0, zero         /* no diff found */
> +    ret
> +
> +.Lvec_diff:         /* v2, v4 differ at elt t3 */
> +    add a0, a0, t3
> +    add a1, a1, t3
> +    lbu t0, (a0)
> +    lbu t1, (a1)
> +    sub a0, t0, t1
> +    ret
> +
> +.Lscalar:
> +    add  t3, a0, a2
> +
> +1:
> +    lbu t0, (a0)
> +    lbu t1, (a1)
> +    sub t2, t0, t1
> +    bnez t2, .Ldone
> +    addi a0, a0, 1
> +    addi a1, a1, 1
> +    bltu a0, t3, 1b
> +
> +.Ldone:
> +    mv a0, t2
> +    ret
> +
> +
> +.size memcmp, .-memcmp
> +libc_hidden_builtin_def (memcmp)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/memcpy.S b/sysdeps/riscv/rv64/rvv/memcpy.S
> new file mode 100644
> index 0000000000..7b37ec285d
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memcpy.S
> @@ -0,0 +1,154 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +/* Optimised memcpy and memmove for riscv with vector extension
> + */
> +
> +.globl  memcpy
> +.type   memcpy,@function
> +.globl  memmove
> +.type   memmove,@function
> +
> +.align    2
> +memmove:
> +    bge     a0, a1, .Lmemcpy_rev
> +
> +memcpy:
> +.Lmemcpy_fwd:
> +    mv      t0, a0          /* t0 = preserve a0 so we can return it */
> +    csrr    t2, vlenb       /* t2 = number of bytes per vectorised copy op */
> +    slli    t5, t2,  1      /* t5 = number of bytes per loop */
> +    addi    t3, t5, -1      /* generate mask */
> +    not     t4, t3
> +    and     t4, a2, t4      /* t4 = bytes copied in vectorised pass */
> +
> +    beqz    t4, .Lscalar_fwd    /* size too small for even one pass? */
> +
> +    and    a2, a2, t3           /* a2 = bytes still left to copy after pass */
> +    add    t4, t4, a1           /* t4 = src at end of vectorised pass */
> +
> +1:
> +    vl2r.v  v2, (a1)            /* load, advance source */
> +    add     a1, a1, t5
> +    vs2r.v  v2, (t0)            /* store, advance dest */
> +    add     t0, t0, t5
> +    bltu    a1, t4, 1b          /* src at end? */
> +
> +    bltu    a2, t2, .Lscalar_fwd /* should we do one more vec load/store? */
> +    vl1r.v  v2, (a1)
> +    sub     a2, a2, t2
> +    add     a1, a1, t2
> +    vs1r.v  v2, (t0)
> +    add     t0, t0, t2
> +
> +.Lscalar_fwd:
> +    bnez    a2, .Lnobail
> +.Lbail:
> +    ret
> +.Lnobail:
> +
> +#ifndef __riscv_strict_align
> +    addi    t2, zero, 4
> +    bltu    a2, t2, .Lsingle_bytes
> +1:
> +    lw      t3, 0(a1)
> +    addi    a1, a1, 4
> +    sw      t3, 0(t0)
> +    addi    t0, t0, 4
> +    addi    a2, a2, -4
> +    bgeu    a2, t2, 1b
> +#endif // __riscv_strict_align
> +
> +.Lsingle_bytes:
> +    beqz    a2, .Lbail
> +    add     a2, a2, a1          /* a2 = src + remaining size */
> +1:
> +    lb      t1, 0(a1)
> +    sb      t1, 0(t0)
> +    addi    a1, a1, 1
> +    addi    t0, t0, 1
> +    bltu    a1, a2, 1b
> +    ret
> +.size   memcpy,     .-memcpy
> +
> +
> +.Lmemcpy_rev:
> +    beq     a0, a1, .Lmemcpy_rev_done
> +    add     t0, a0, a2          /* t0 = dest so we can return a0=dest later */
> +    add     t6, a1, a2          /* dest and src both point to byte */
> +                                /* immediately after end of buffer */
> +
> +    csrr    t2, vlenb           /* t2 = number of bytes per pass */
> +    slli    t5, t2,  1          /* t5 = number of bytes per entire loop */
> +    addi    t3, t5, -1          /* t3 = (bytes per loop) mask */
> +    not     t4, t3              /* generate mask for bytes processed by loop */
> +    and     t4, a2, t4          /* t4 = bytes copied in vectorised pass */
> +
> +    beqz    t4, .Lscalar_rev    /* size too small for even one pass? */
> +
> +    and    a2, a2, t3           /* a2 = bytes still left to copy after pass */
> +    sub    t4, t6, t4           /* t4 = src at end of vectorised pass */
> +
> +1:
> +    sub     t6, t6, t5
> +    sub     t0, t0, t5
> +    vl2r.v  v2, (t6)            /* load, advance source */
> +    vs2r.v  v2, (t0)            /* store, advance dest */
> +    bgtu    t6, t4, 1b          /* src at end? */
> +
> +    bltu    a2, t2, .Lscalar_rev /* should we do one more vec load/store? */
> +    sub     t6, t6, t2
> +    sub     t0, t0, t2
> +    sub     a2, a2, t2
> +    vl1r.v  v2, (t6)
> +    vs1r.v  v2, (t0)
> +
> +.Lscalar_rev:
> +#ifndef __riscv_strict_align
> +    beqz    a2, .Lbail
> +
> +    addi    t2, zero, 4
> +    bltu    a2, t2, 2f
> +1:
> +    addi    t6, t6, -4
> +    addi    t0, t0, -4
> +    addi    a2, a2, -4
> +    lw      t3, 0(t6)
> +    sw      t3, 0(t0)
> +    bgeu    a2, t2, 1b
> +2:
> +#endif // __riscv_strict_align
> +
> +    beqz    a2, .Lbail
> +1:
> +    addi    t6, t6, -1
> +    addi    t0, t0, -1
> +    lb      t1, 0(t6)
> +    sb      t1, 0(t0)
> +    bgtu    t0, a0, 1b
> +
> +.Lmemcpy_rev_done:
> +    ret
> +
> +.size   memmove, .-memmove
> +libc_hidden_builtin_def (memcpy)
> +libc_hidden_builtin_def (memmove)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/memmove.c b/sysdeps/riscv/rv64/rvv/memmove.c
> new file mode 100644
> index 0000000000..47734854f9
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memmove.c
> @@ -0,0 +1,22 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +/* memmove is implemented in memcpy.S
> + */
> diff --git a/sysdeps/riscv/rv64/rvv/memset.S b/sysdeps/riscv/rv64/rvv/memset.S
> new file mode 100644
> index 0000000000..6f82c542b1
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memset.S
> @@ -0,0 +1,89 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +
> +/* Optimised memset for riscv with vector extension
> + */
> +
> +.globl  memset
> +.type   memset,@function
> +
> +.align    2
> +memset:
> +    mv      t0, a0                  /* t0 = dest so we can return a0 later */
> +    vsetvli t2, a2, e8, m2, ta, ma  /* t2 = elts per copy */
> +    beqz    t2, .Lscalar
> +
> +    vmv.v.x v2, a1                  /* splat value across v2 */
> +
> +    slli    t3, t2, 1
> +    bgtu    t3, a2, .Lsinglestore
> +
> +1:
> +    vse8.v  v2, (t0)
> +    add     t0, t0, t2
> +    vse8.v  v2, (t0)
> +    add     t0, t0, t2
> +    sub     a2, a2, t3
> +    bgeu    a2, t3, 1b
> +    bgeu    a2, t2, .Lsinglestore
> +    bnez    a2, .Lscalar
> +
> +.Lbail:
> +    ret
> +
> +.Lsinglestore:
> +    bgtu    t2, a2, .Lscalar
> +    vse8.v  v2, (t0)
> +    add     t0, t0, t2
> +    sub     a2, a2, t2
> +
> +.Lscalar:
> +    beqz    a2, .Lbail
> +
> +#ifndef __riscv_strict_align
> +    slli    t2, a1, 8
> +    or      a1, a1, t2
> +    slli    t2, a1, 16
> +    or      a1, a1, t2
> +
> +    addi    t2, zero, 4
> +    bltu    a2, t2, 2f
> +
> +1:
> +    sw      a1, 0(t0)
> +    addi    t0, t0, 4
> +    addi    a2, a2, -4
> +    bgeu    a2, t2, 1b
> +2:
> +    beqz    a2, .Lbail
> +#endif // __riscv_strict_align
> +
> +    add     a2, a2, t0
> +1:
> +    sb      a1, 0(t0)
> +    addi    t0, t0, 1
> +    bltu    t0, a2, 1b
> +    ret
> +
> +.size   memset,     .-memset
> +libc_hidden_builtin_def (memset)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strchr.S b/sysdeps/riscv/rv64/rvv/strchr.S
> new file mode 100644
> index 0000000000..0b37174c55
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strchr.S
> @@ -0,0 +1,92 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strchr
> +.type   strchr,@function
> +
> +.globl  __strchrnul
> +.type   __strchrnul,@function
> +
> +/*
> + *  optimized strchr for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 2*vlenb
> + */
> +
> +.align    2
> +__strchrnul:
> +    li t5, -1
> +    j  1f
> +
> +strchr:
> +    mv          t5, zero
> +1:  csrr        t1, vlenb              /* find vlenb*2 */
> +    add         t1, t1, t1
> +    addi        t2, t1, -1             /* mask off unaligned part of pointer */
> +    and         t2, a0, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2             /* search however many bytes
> +                                          are needed to align the pointer */
> +    vsetvli     t2, t2, e8, m2, ta, mu
> +
> +    vle8.v      v2, (a0)               /* load data into v2(,v3) */
> +    vmseq.vx    v4, v2, zero
> +    vfirst.m    t4, v4
> +    vmsbf.m     v0, v4
> +    vmseq.vx    v0, v2, a1, v0.t
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lfound
> +    bgez        t4, .Lbufferend
> +    add         a0, a0, t2
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, mu
> +    li          t4, -1
> +
> +1:
> +    vle8.v      v2, (a0)
> +    vmseq.vx    v4, v2, zero
> +    vfirst.m    t4, v4
> +    vmsbf.m     v0, v4
> +    vmseq.vx    v0, v2, a1, v0.t
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lfound
> +    bgez        t4, .Lbufferend
> +    add         a0, a0, t1
> +    j           1b
> +
> +.Lfound:                                    /* found the target at a0+t3 */
> +    add         a0, a0, t3
> +    ret
> +
> +.Lbufferend:
> +    add         a0, a0, t4
> +    and         a0, a0, t5
> +    ret
> +
> +.size   strchr, .-strchr
> +.size   __strchrnul, .-__strchrnul
> +
> +libc_hidden_builtin_def (strchr)
> +weak_alias (__strchrnul, strchrnul)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strchrnul.c b/sysdeps/riscv/rv64/rvv/strchrnul.c
> new file mode 100644
> index 0000000000..259da80358
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strchrnul.c
> @@ -0,0 +1,22 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +/* strchrnul is implemented in strchr.S
> + */
> diff --git a/sysdeps/riscv/rv64/rvv/strcmp.S b/sysdeps/riscv/rv64/rvv/strcmp.S
> new file mode 100644
> index 0000000000..4a219221ac
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcmp.S
> @@ -0,0 +1,108 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strcmp
> +.type   strcmp,@function
> +
> +.align    2
> +
> +/* most of the time, one or both sides is unaligned and their alignments differ
> + * we need to check for a null terminator before crossing a page boundary
> + * strategy:
> + * - for each side, calculate masks for alignment and (vlenb * 2) - alignment
> + * - while no difference encountered:
> + * -   for each side:
> + * -       load bytes to end of next vlenb*2 block
> + * -       check for null terminator
> + * -       if no terminator, load bytes to fill rest of register
> + * -   compare sides
> + */
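
For reference, the strategy above corresponds roughly to the scalar C sketch
below.  BLOCK stands in for vlenb*2 and the function name is made up for
illustration; neither comes from the patch.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #define BLOCK 32   /* stands in for vlenb * 2; assumed to divide the page size */

    static int
    strcmp_sketch (const char *lhs, const char *rhs)
    {
      for (;;)
        {
          /* bytes each side can read before hitting its next BLOCK boundary */
          size_t lsafe = BLOCK - ((uintptr_t) lhs & (BLOCK - 1));
          size_t rsafe = BLOCK - ((uintptr_t) rhs & (BLOCK - 1));

          /* if either safe prefix holds the terminator, finish bytewise */
          if (memchr (lhs, '\0', lsafe) != NULL || memchr (rhs, '\0', rsafe) != NULL)
            break;

          /* no terminator in the prefixes, so a full BLOCK on each side is
             known to stay on a mapped page; compare it, stopping at the
             first NUL or mismatch */
          for (size_t i = 0; i < BLOCK; i++)
            if (lhs[i] != rhs[i] || lhs[i] == '\0')
              return (unsigned char) lhs[i] - (unsigned char) rhs[i];

          lhs += BLOCK;
          rhs += BLOCK;
        }

      /* bytewise tail, mirroring .Ltail in the assembly */
      while (*lhs != '\0' && *lhs == *rhs)
        {
          lhs++;
          rhs++;
        }
      return (unsigned char) *lhs - (unsigned char) *rhs;
    }
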
> +
> +strcmp:
> +    csrr        t1, vlenb                   /* find vlenb*2 */
> +    add         t1, t1, t1
> +    vsetvli     zero, t1, e8, m2, ta, mu
> +    vid.v       v30
> +    addi        t2, t1, -1         /* mask for unaligned part of ptr */
> +    and         t6, a0, t2         /* unaligned part of lhs */
> +    and         t5, a1, t2         /* unaligned part of rhs */
> +    sub         t6, t1, t6         /* safe number of lhs bytes to read */
> +    sub         t5, t1, t5         /* same, rhs */
> +    vmsltu.vx   v28, v30, t6       /* v28 = mask for first half of lhs load */
> +    vmsltu.vx   v26, v30, t5       /* v26 = mask for first half of rhs load */
> +    vmv.v.x     v16, zero
> +    vmv.v.x     v18, zero
> +
> +1:  vmv.v.v     v0, v28              /* lhs mask */
> +    vle8.v      v2, (a0), v0.t       /* masked load from lhs */
> +    vmseq.vx    v16, v2, zero, v0.t  /* check loaded bytes for null */
> +    vmv.v.v     v0, v26              /*       rhs mask */
> +    vfirst.m    t2, v16              /* get lhs check result */
> +    bgez        t2, .Ltail           /* bail if we can't safely check rest */
> +    vle8.v      v4, (a1), v0.t       /*    masked load from rhs */
> +    vmseq.vx    v18, v4, zero, v0.t  /*    check partial rhs for null */
> +    vmnot.m     v0, v28              /* mask for rest of lhs */
> +    vfirst.m    t3, v18              /* get check result */
> +    bltz        t3, 2f               /* test it */
> +                                     /* we see null terminator */
> +    bge         t3, t6, .Ltail       /* have enough bytes for vector cmp? */
> +
> +    vmsleu.vx   v0, v30, t3          /* select rest + null */
> +    vmsne.vv    v0, v2, v4, v0.t     /* compare */
> +    vfirst.m    t3, v0
> +    bgez        t3, 3f
> +    mv          a0, zero             /* no difference */
> +    ret
> +3:  add a0, a0, t3
> +    add a1, a1, t3
> +    lbu t0, (a0)
> +    lbu t1, (a1)
> +.Ldiff:
> +    sub a0, t0, t1
> +    ret
> +
> +    /* ...no null terminator */
> +2:  vle8.v      v2, (a0), v0.t       /* load rest of lhs */
> +    vmnot.m     v0, v26              /* mask for rest of rhs */
> +    vle8.v      v4, (a1), v0.t       /* load rest of rhs */
> +    vmsne.vv    v0, v2, v4           /* compare */
> +    add         a0, a0, t1           /* advance ptrs */
> +    vfirst.m    t3, v0
> +    add         a1, a1, t1
> +    bltz        t3, 1b
> +
> +    sub t3, t3, t1 /* found difference but we've already advanced a0 and a1 */
> +    j 3b
> +
> +.Ltail:
> +    lbu t0, (a0)
> +    lbu t1, (a1)
> +    bne t0, t1, .Ldiff
> +    addi a0, a0, 1
> +    addi a1, a1, 1
> +    bnez t0, .Ltail
> +    mv a0, zero
> +    ret
> +
> +
> +.size strcmp, .-strcmp
> +libc_hidden_builtin_def (strcmp)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strcpy.S b/sysdeps/riscv/rv64/rvv/strcpy.S
> new file mode 100644
> index 0000000000..b21909d66f
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcpy.S
> @@ -0,0 +1,72 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strcpy
> +.type   strcpy,@function
> +
> +/*
> + *  optimized strcpy for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 2*vlenb
> + */
> +
> +.align    2
> +strcpy:
> +    mv          t0, a0                     /* copy dest so we can return it */
> +
> +    csrr        t1, vlenb                  /* find vlenb*2 */
> +    add         t1, t1, t1
> +
> +    addi        t2, t1, -1                 /* mask unaligned part of ptr */
> +    and         t2, a1, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2                 /* search enough to align ptr */
> +    vsetvli     t2, t2, e8, m2, tu, mu
> +    vle8.v      v2, (a1)
> +    vmseq.vx    v4, v2, zero
> +    vmsif.m     v0, v4                     /* copy but not past null */
> +    vfirst.m    t3, v4
> +    vse8.v      v2, (t0), v0.t
> +    bgez        t3, .Ldone
> +    add         t0, t0, t2
> +    add         a1, a1, t2
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, mu   /* now do 2*vlenb bytes per pass */
> +
> +1:
> +    vle8.v      v2, (a1)
> +    add         a1, a1, t1
> +    vmseq.vx    v4, v2, zero
> +    vmsif.m     v0, v4
> +    vfirst.m    t3, v4
> +    vse8.v      v2, (t0), v0.t
> +    add         t0, t0, t1
> +    bltz        t3, 1b
> +
> +.Ldone:
> +    ret
> +
> +.size   strcpy, .-strcpy
> +libc_hidden_builtin_def (strcpy)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strcspn.c b/sysdeps/riscv/rv64/rvv/strcspn.c
> new file mode 100644
> index 0000000000..f0595a72fb
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcspn.c
> @@ -0,0 +1,22 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +/* strcspn is implemented in strspn.S
> + */
> diff --git a/sysdeps/riscv/rv64/rvv/strlen.S b/sysdeps/riscv/rv64/rvv/strlen.S
> new file mode 100644
> index 0000000000..c77d500693
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strlen.S
> @@ -0,0 +1,67 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strlen
> +.type   strlen,@function
> +
> +/*
> + *  optimized strlen for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 2*vlenb
> + */
> +
> +.align    2
> +strlen:
> +    mv          t4, a0                    /* copy of buffer start */
> +    csrr        t1, vlenb                 /* find vlenb*2 */
> +    add         t1, t1, t1
> +    addi        t2, t1, -1                /* mask off unaligned part of ptr */
> +    and         t2, a0, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2                /* search fwd to align ptr */
> +    vsetvli     t2, t2, e8, m2, ta, ma
> +    vle8.v      v2, (a0)
> +    vmseq.vx    v0, v2, zero
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lfound
> +    add         a0, a0, t2
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, ma  /* search 2*vlenb bytes per pass */
> +    add         t4, t4, t1
> +
> +1:
> +    vle8.v      v2, (a0)
> +    add         a0, a0, t1
> +    vmseq.vx    v0, v2, zero
> +    vfirst.m    t3, v0
> +    bltz        t3, 1b
> +
> +.Lfound:                                  /* found the 0; subtract          */
> +    sub         a0, a0, t4                /* buffer start from current ptr  */
> +    add         a0, a0, t3                /* and add offset into fetched    */
> +    ret                                   /* data to get length */
> +
> +.size   strlen, .-strlen
> +libc_hidden_builtin_def (strlen)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strncmp.S b/sysdeps/riscv/rv64/rvv/strncmp.S
> new file mode 100644
> index 0000000000..863e5cb525
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strncmp.S
> @@ -0,0 +1,104 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strncmp
> +.type   strncmp,@function
> +
> +.align    2
> +
> +/* as strcmp, but with added checks on a2 (max count)
> + */
> +
> +strncmp:
> +    csrr        t1, vlenb                   /* find vlenb*2 */
> +    add         t1, t1, t1
> +    blt         a2, t1, .Ltail              /* degrade if max < vlenb*2 */
> +    vsetvli     zero, t1, e8, m2, ta, mu
> +    vid.v       v30
> +    addi        t2, t1, -1                  /* mask unaligned part of ptr */
> +    and         t6, a0, t2                  /* unaligned part of lhs */
> +    and         t5, a1, t2                  /* unaligned part of rhs */
> +    sub         t6, t1, t6                  /* safe count to read from lhs */
> +    sub         t5, t1, t5                  /* same, rhs */
> +    vmsltu.vx   v28, v30, t6                /* mask for first part of lhs */
> +    vmsltu.vx   v26, v30, t5                /* mask for first part of rhs */
> +    vmv.v.x     v16, zero
> +    vmv.v.x     v18, zero
> +
> +
> +1:  blt         a2, t1, .Ltail
> +    vmv.v.v     v0, v28                     /* lhs mask */
> +    vle8.v      v2, (a0), v0.t              /* masked load from lhs */
> +    vmseq.vx    v16, v2, zero, v0.t         /* check loaded bytes for null */
> +    vmv.v.v     v0, v26                     /*       rhs mask */
> +    vfirst.m    t2, v16                     /* get lhs check result */
> +    bgez        t2, .Ltail                  /* can we safely check rest */
> +    vle8.v      v4, (a1), v0.t              /*       masked load from rhs */
> +    vmseq.vx    v18, v4, zero, v0.t         /*       check partial rhs */
> +    vmnot.m     v0, v28                     /* mask for rest of lhs */
> +    vfirst.m    t3, v18                     /* get check result */
> +    bltz        t3, 2f                      /* test it */
> +    bge         t3, t6, .Ltail
> +
> +    vmsleu.vx   v0, v30, t3                 /* select rest of string + null */
> +    vmsne.vv    v0, v2, v4, v0.t            /* compare */
> +    vfirst.m    t3, v0
> +    bgez        t3, 3f
> +    mv          a0, zero
> +    ret
> +3:  add a0, a0, t3
> +    add a1, a1, t3
> +    lbu t0, (a0)
> +    lbu t1, (a1)
> +.Ldiff:
> +    sub a0, t0, t1
> +    ret
> +
> +    /* ...no null terminator in first part of lhs or rhs */
> +2:  vle8.v      v2, (a0), v0.t              /* load rest of lhs */
> +    vmnot.m     v0, v26                     /* mask for rest of rhs */
> +    vle8.v      v4, (a1), v0.t              /* load rest of rhs */
> +    vmsne.vv    v0, v2, v4                  /* compare */
> +    add         a0, a0, t1                  /* advance ptrs */
> +    vfirst.m    t3, v0
> +    add         a1, a1, t1
> +    sub         a2, a2, t1
> +    bltz        t3, 1b
> +
> +    sub t3, t3, t1  /* found a diff but we've already advanced a0 and a1 */
> +    j 3b
> +
> +.Ltail:
> +    beqz a2, 1f
> +    addi a2, a2, -1
> +    lbu t0, (a0)
> +    lbu t1, (a1)
> +    bne t0, t1, .Ldiff
> +    addi a0, a0, 1
> +    addi a1, a1, 1
> +    bnez t0, .Ltail
> +1:  mv a0, zero
> +    ret
> +
> +
> +.size strncmp, .-strncmp
> +libc_hidden_builtin_def (strncmp)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strncpy.S b/sysdeps/riscv/rv64/rvv/strncpy.S
> new file mode 100644
> index 0000000000..8b3a1e545c
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strncpy.S
> @@ -0,0 +1,96 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strncpy
> +.type   strncpy,@function
> +
> +/*
> + *  optimized strncpy for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 2*vlenb
> + */
> +
> +.align    2
> +strncpy:
> +    mv          t0, a0                    /* need to return dest so copy */
> +
> +    csrr        t1, vlenb                 /* find vlenb*2 */
> +    add         t1, t1, t1
> +
> +    addi        t2, t1, -1                /* mask off unaligned part of ptr */
> +    and         t2, a1, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2                /* search to align the pointer */
> +    vsetvli     zero, t2, e8, m2, tu, mu
> +    vle8.v      v2, (a1)
> +    vmseq.vx    v4, v2, zero
> +    vmsif.m     v0, v4                    /* copy to dest */
> +    vfirst.m    t3, v4
> +    bgeu        t2, a2, .Ldest_full
> +    vse8.v      v2, (t0), v0.t
> +    bgez        t3, .Lterminator_found
> +    add         t0, t0, t2
> +    add         a1, a1, t2
> +    sub         a2, a2, t2
> +    beqz        a2, .Ldone
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> +
> +1:
> +    vle8.v      v2, (a1)
> +    add         a1, a1, t1
> +    vmseq.vx    v4, v2, zero
> +    vmsif.m     v0, v4
> +    vfirst.m    t3, v4
> +    bgeu        t1, a2, .Ldest_full
> +    vse8.v      v2, (t0), v0.t
> +    add         t0, t0, t1
> +    sub         a2, a2, t1
> +    bltz        t3, 1b
> +    sub         t0, t0, t1
> +
> +.Lterminator_found:
> +    addi        sp, sp, -16
> +    sd          ra, 0(sp)
> +    sd          a0, 8(sp)
> +    add         a0, t0, t3
> +    mv          a1, zero
> +    sub         a2, a2, t3
> +    jal         ra, memset
> +    ld          ra, 0(sp)
> +    ld          a0, 8(sp)
> +    addi        sp, sp, 16
> +.Ldone:
> +    ret
> +
> +.Ldest_full:
> +    vid.v       v6
> +    vmsltu.vx   v4, v6, a2
> +    vmand.mm    v0, v0, v4
> +    vse8.v      v2, (t0), v0.t
> +    ret
> +
> +.size   strncpy, .-strncpy
> +libc_hidden_builtin_def (strncpy)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strnlen.S b/sysdeps/riscv/rv64/rvv/strnlen.S
> new file mode 100644
> index 0000000000..6d7ee65c7a
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strnlen.S
> @@ -0,0 +1,81 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  __strnlen
> +.type   __strnlen,@function
> +
> +/* vector optimized strnlen
> + * assume it's safe to read to the end of the page
> + * containing either a null terminator or the last byte of the count or both,
> + * but not past it
> + * assume page size >= vlenb*2
> + */
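
The page-boundary argument relied on here (and throughout the series) can be
spelled out as a small check; the function below is purely illustrative and
not part of the patch.

    #include <assert.h>
    #include <stdint.h>

    /* If B is a power of two no larger than the page size, a B-byte read
       starting at a B-aligned address can never cross a page boundary, so
       reading to the end of the current block stays on the page that already
       contains valid string bytes.  */
    static void
    block_read_stays_on_page (uintptr_t addr, uintptr_t B, uintptr_t pagesize)
    {
      assert ((pagesize & (pagesize - 1)) == 0);
      assert ((B & (B - 1)) == 0 && B <= pagesize);
      uintptr_t block = addr & ~(B - 1);             /* block containing addr  */
      uintptr_t page  = block & ~(pagesize - 1);     /* page containing block  */
      /* last byte of the block is still on the same page */
      assert (((block + B - 1) & ~(pagesize - 1)) == page);
    }
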
> +
> +.align    2
> +__strnlen:
> +    mv          t4, a0               /* stash a copy of start for later */
> +    beqz        a1, .LzeroCount
> +
> +    csrr        t1, vlenb            /* find vlenb*2 */
> +    add         t1, t1, t1
> +    addi        t2, t1, -1           /* mask off unaligned part of ptr */
> +    and         t2, a1, a0
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2           /* search to align pointer to t1 */
> +    bgeu        t2, a1, 2f           /* check it's safe */
> +    mv          t2, a1               /* it's not! look as far as permitted */
> +2:  vsetvli     t2, t2, e8, m2, ta, ma
> +    vle8.v      v2, (a0)
> +    vmseq.vx    v0, v2, zero
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lfound
> +    add         a0, a0, t2
> +    sub         a1, a1, t2
> +    bltu        a1, t1, .LreachedCount
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, ma    /* do 2*vlenb bytes per pass */
> +
> +1:  vle8.v      v2, (a0)
> +    sub         a1, a1, t1
> +    vmseq.vx    v0, v2, zero
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lfound
> +    add         a0, a0, t1
> +    bgeu        a1, t1, 1b
> +.LreachedCount:
> +    mv          t2, a1    /* in case 0 < a1 < t1 */
> +    bnez        a1, 2b    /* if so, still t2 bytes to check, all safe */
> +.LzeroCount:
> +    sub         a0, a0, t4
> +    ret
> +
> +.Lfound:        /* found the 0; subtract buffer start from current pointer */
> +    add         a0, a0, t3 /* and add offset into fetched data */
> +    sub         a0, a0, t4
> +    ret
> +
> +.size   __strnlen, .-__strnlen
> +weak_alias (__strnlen, strnlen)
> +libc_hidden_builtin_def (__strnlen)
> +libc_hidden_builtin_def (strnlen)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strrchr.S b/sysdeps/riscv/rv64/rvv/strrchr.S
> new file mode 100644
> index 0000000000..4bef8a3b9c
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strrchr.S
> @@ -0,0 +1,88 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strrchr
> +.type   strrchr,@function
> +
> +/*
> + *  optimized strrchr for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 2*vlenb
> + */
> +
> +.align    2
> +strrchr:
> +    mv          t5, a0    /* stash buffer ptr somewhere safe */
> +    mv          a0, zero  /* result is nullptr unless we find better below */
> +
> +    csrr        t1, vlenb                /* determine vlenb*2 */
> +    add         t1, t1, t1
> +    addi        t2, t1, -1               /* mask off unaligned part of ptr */
> +    and         t2, t5, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2               /* search to align ptr to 2*vlenb */
> +    vsetvli     t2, t2, e8, m2, ta, mu
> +
> +    vle8.v      v2, (t5)                 /* load data into v2(,v3) */
> +    vmseq.vx    v4, v2, zero             /* check for null terminator */
> +    vfirst.m    t4, v4                   /* grab its position, if any */
> +    vmsbf.m     v0, v4                   /* select valid chars */
> +    vmseq.vx    v0, v2, a1, v0.t         /* search for candidate byte */
> +    vfirst.m    t3, v0                   /* grab its position, if any */
> +    bltz        t3, 2f                   /* did we find a candidate? */
> +
> +3:  add         a0, t3, t5               /* we did! grab the address */
> +    vmsof.m     v1, v0                   /* there might be more than one */
> +    vmandn.mm   v0, v0, v1               /* so clear the one we just found */
> +    vfirst.m    t3, v0                   /* is there another? */
> +    bgez        t3, 3b
> +
> +2:  bgez        t4, .Ldone               /* did we see a null terminator? */
> +    add         t5, t5, t2
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> +
> +1:  vle8.v      v2, (t5)
> +    vmseq.vx    v4, v2, zero
> +    vfirst.m    t4, v4
> +    vmsbf.m     v0, v4
> +    vmseq.vx    v0, v2, a1, v0.t
> +    vfirst.m    t3, v0
> +    bltz        t3, 2f
> +
> +3:  add         a0, t3, t5
> +    vmsof.m     v1, v0
> +    vmandn.mm   v0, v0, v1
> +    vfirst.m    t3, v0
> +    bgez        t3, 3b
> +
> +2:  add         t5, t5, t1
> +    bltz        t4, 1b
> +
> +.Ldone:
> +    ret
> +
> +.size   strrchr, .-strrchr
> +libc_hidden_builtin_def (strrchr)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strspn.S b/sysdeps/riscv/rv64/rvv/strspn.S
> new file mode 100644
> index 0000000000..2b9af5cc2d
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strspn.S
> @@ -0,0 +1,189 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strspn
> +.type   strspn,@function
> +
> +.globl  strcspn
> +.type   strcspn,@function
> +
> +/*
> + *  optimized strspn / strcspn for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 32
> + *  strategy:
> + *  - build a 256-bit lookup table in a vector register, where the bit for
> + *    each byte value is zero if encountering it should terminate computation
> + *    and nonzero otherwise
> + *  - use vectorised lookups into this table to check 32 bytes at a time;
> + *    this code is identical for strspn and strcspn and can be shared
> + *
> + *  note that while V mandates at least 128 bit wide regs,
> + *  we are building a 256 bit lookup table,
> + *  so we use either LMUL=1 or LMUL=2 depending on what the target supports;
> + *  because of that, we only use even vector register numbers,
> + *  so everything still works if we go with LMUL=2
> + */
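
A scalar C sketch of the table build described above, for reference (the
function and array names are illustrative, not from the patch); the layout
matches the assembly: byte value c maps to bit (c & 7) of table byte (c >> 3).

    #include <stddef.h>

    /* one bit per possible byte value: set means "byte is in the charset";
       bit 0 of byte 0 (the NUL terminator) is never set, so scans stop there */
    static void
    build_accept_table (const char *charset, unsigned char table[32])
    {
      for (size_t i = 0; i < 32; i++)
        table[i] = 0;
      for (const unsigned char *p = (const unsigned char *) charset; *p != '\0'; p++)
        table[*p >> 3] |= (unsigned char) (1u << (*p & 7));
    }
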
> +
> +# -----------------------------
> +
> +.align    2
> +
> +strspn:
> +    lbu         t0, 0(a1)
> +    bnez        t0, .Lbuild_table
> +    mv          a0, zero
> +    ret
> +
> +.Lbuild_table:
> +    mv          a6, a0 /* store incoming a0 */
> +    li          t1, 32 /* want to deal with 256 bits at a time, so 32 bytes */
> +
> +    vsetvli     zero, t1, e8, m1, tu, mu
> +#if __riscv_v_min_vlen < 256
> +    /* we want to build a 256-bit table, so use vlenb*2,
> +     * m2 if regs are 128 bits wide or vlenb, m1 if >= 256
> +     * 'V' extension specifies a minimum vlen of 128 so this should cover
> +     * all cases; we can skip the check if we know vlen >= 256 at compile time
> +     */
> +    csrr        t2, vlenb
> +    bgeu        t2, t1, 1f
> +    vsetvli     zero, t1, e8, m2, tu, mu
> +1:
> +#endif // __riscv_v_min_vlen
> +
> +    /* read one char from the charset at a time and write the correct bit
> +     * in the lookup table; we could do SIMD iff we ever get an extension
> +     * that provides some way of scattering bytes into a reg group
> +     */
> +    vmv.v.x     v16, zero       /* clear out table */
> +    vmv.v.x     v8, zero        /* clear out v8 */
> +    li          t3, 1
> +    vmv.s.x     v8, t3          /* v8 now all zeroes except bottom byte */
> +
> +1:  vmv.v.x     v2, zero        /* clear out v2 */
> +    addi        a1, a1, 1       /* advance charset ptr */
> +    srli        t2, t0, 3       /* divide the byte we read earlier by 8 */
> +    vslideup.vx v2, v8, t2      /* v2 now 1 in the correct byte 0 elsewhere */
> +    vsll.vx     v2, v2, t0      /* v2 now 1 in the correct bit, 0 elsewhere */
> +    vor.vv      v16, v16, v2    /* or it in */
> +    lbu         t0, 0(a1)       /* fetch next byte */
> +    bnez        t0, 1b          /* if it's not null, go round again */
> +
> +/*
> + *   Table is now built in v16.
> + *   Strategy:
> + *   - fetch next t1 bytes from memory
> + *   - vrgather on their values divided by 8 to get relevant bytes of table
> + *   - shift right to get the correct bit into bit 1
> + *   - and with 1, compare with expected terminator value, then check mask
> + *     to see if we've found a terminator
> + *
> + *   Before we can begin, a0 needs to be t1-aligned, so that when we fetch
> + *   the next t1 bytes - any of which may be the null terminator -
> + *   we do not cross a page boundary and read unmapped memory. Therefore
> + *   we have one read of however many bytes are needed to align a0,
> + *   before the main loop.
> + */
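
A scalar sketch of the scan phase, to pair with the table builder above
(again illustrative only; the vector code adds the alignment handling and the
32-bytes-per-pass blocking described in the comment).

    #include <stddef.h>

    /* length of the initial run of bytes whose table bit is set; since bit 0
       of table byte 0 is clear, the run always stops at the NUL terminator */
    static size_t
    scan_with_table (const char *s, const unsigned char table[32])
    {
      size_t n = 0;
      while ((table[(unsigned char) s[n] >> 3] >> ((unsigned char) s[n] & 7)) & 1)
        n++;
      return n;
    }
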
> +
> +.Lscan_table:
> +    vmv.v.x     v8, t3              /* v8 now t1 bytes of 0x01 */
> +
> +    and         t2, a0, t1          /* mask to align to t1 */
> +    beqz        t2, 2f              /* or skip if we're already aligned */
> +    sub         t2, t1, t2          /* t2 now bytes to read to align to t1 */
> +
> +    vid.v       v2                  /* build mask instead of changing vl */
> +    vmsltu.vx   v0, v2, t2          /* so we don't need to track LMUL */
> +
> +    vle8.v      v2, (a0), v0.t      /* load next bytes from input */
> +    vsrl.vi     v4, v2, 3           /* divide by 8 */
> +    vrgather.vv v6, v16, v4         /* corresponding bytes of bit table */
> +    vsrl.vv     v6, v6, v2          /* shift correct bits to lsb */
> +    vand.vv     v6, v6, v8          /* and with 1 to complete the lookups */
> +    vmseq.vx    v4, v6, zero, v0.t  /* check to see if any 0s are present */
> +    vfirst.m    t0, v4              /* index of the first 0, if any */
> +    bgez        t0, .Lscan_end      /* if we found one, stop */
> +    add         a0, a0, t2          /* advance by number of bytes we read */
> +
> +2:  add         a6, a6, t1     /* we'll advance a0 before the exit check */
> +1:  vle8.v      v2, (a0)       /* as above but unmasked so t1 elts per pass */
> +    add         a0, a0, t1
> +
> +    vsrl.vi     v4, v2, 3
> +    vrgather.vv v6, v16, v4
> +    vsrl.vv     v6, v6, v2
> +    vand.vv     v6, v6, v8
> +
> +    vmseq.vx    v4, v6, zero
> +    vfirst.m    t0, v4
> +    bltz        t0, 1b
> +
> +.Lscan_end:
> +    add         a0, a0, t0     /* calculate offset to terminating byte */
> +    sub         a0, a0, a6
> +    ret
> +.size   strspn, .-strspn
> +
> +/* strcspn
> + *
> + * table build exactly as for strspn, except:
> + * - the lookup table starts with all bits except bit 0 of byte 0 set
> + * - we clear the corresponding bit for each byte in the charset
> + * once table is built, we can reuse the scan code directly
> + */
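
Sketched the same way, the strcspn table build starts from the complement
(function name illustrative, not from the patch):

    #include <string.h>

    /* every bit set except bit 0 of byte 0 (so NUL still terminates), then
       clear the bit for each byte of the reject set */
    static void
    build_reject_table (const char *reject, unsigned char table[32])
    {
      memset (table, 0xff, 32);
      table[0] &= (unsigned char) ~1u;
      for (const unsigned char *p = (const unsigned char *) reject; *p != '\0'; p++)
        table[*p >> 3] &= (unsigned char) ~(1u << (*p & 7));
    }
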
> +
> +strcspn:
> +    lbu         t0, 0(a1)
> +    beqz        t0, strlen   /* no rejections -> prefix is whole string */
> +
> +    mv          a6, a0
> +    li          t1, 32
> +
> +    vsetvli     zero, t1, e8, m1, tu, mu
> +#if __riscv_v_min_vlen < 256
> +    csrr        t2, vlenb
> +    bgeu        t2, t1, 1f
> +    vsetvli     zero, t1, e8, m2, tu, mu
> +1:
> +#endif // __riscv_v_min_vlen
> +
> +    vmv.v.x     v8, zero
> +    li          t3, 1           /* all bits clear except bit 0 of byte 0 */
> +    vmv.s.x     v8, t3
> +    vnot.v      v16, v8         /* v16 is the inverse of that */
> +    li          t4, -1
> +
> +1:  vmv.v.x     v2, zero
> +    addi        a1, a1, 1       /* advance charset ptr */
> +    srli        t2, t0, 3       /* select correct bit in v2 */
> +    vslideup.vx v2, v8, t2
> +    vsll.vx     v2, v2, t0
> +    vnot.v      v2, v2          /* invert */
> +    vand.vv     v16, v16, v2    /* clear the relevant bit of table */
> +    lbu         t0, 0(a1)
> +    bnez        t0, 1b
> +    j           .Lscan_table
> +.size   strcspn, .-strcspn
> +
> +libc_hidden_builtin_def (strspn)
> +libc_hidden_builtin_def (strcspn)
> \ No newline at end of file
> --
> 2.34.1
>
  
Andrew Waterman Feb. 1, 2023, 7:03 p.m. UTC | #9
On Wed, Feb 1, 2023 at 1:54 AM Sergei Lewis <slewis@rivosinc.com> wrote:
>
> Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
> strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
> targeting the riscv "V" extension, version 1.0
>
> The vectorised implementations assume VLENB of at least 128 and at least 32
> registers (as mandated by the "V" extension spec). They also assume that
> VLENB is a power of two which is no larger than the page size, and (as
> vectorised code in glibc for other platforms does) that it is safe to read
> past null terminators / buffer ends provided one does not cross a page
> boundary.
>
> Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
> ---
>  sysdeps/riscv/rv64/rvv/Implies     |   2 +
>  sysdeps/riscv/rv64/rvv/memchr.S    | 127 +++++++++++++++++++
>  sysdeps/riscv/rv64/rvv/memcmp.S    |  93 ++++++++++++++
>  sysdeps/riscv/rv64/rvv/memcpy.S    | 154 +++++++++++++++++++++++
>  sysdeps/riscv/rv64/rvv/memmove.c   |  22 ++++
>  sysdeps/riscv/rv64/rvv/memset.S    |  89 ++++++++++++++
>  sysdeps/riscv/rv64/rvv/strchr.S    |  92 ++++++++++++++
>  sysdeps/riscv/rv64/rvv/strchrnul.c |  22 ++++
>  sysdeps/riscv/rv64/rvv/strcmp.S    | 108 +++++++++++++++++
>  sysdeps/riscv/rv64/rvv/strcpy.S    |  72 +++++++++++
>  sysdeps/riscv/rv64/rvv/strcspn.c   |  22 ++++
>  sysdeps/riscv/rv64/rvv/strlen.S    |  67 ++++++++++
>  sysdeps/riscv/rv64/rvv/strncmp.S   | 104 ++++++++++++++++
>  sysdeps/riscv/rv64/rvv/strncpy.S   |  96 +++++++++++++++
>  sysdeps/riscv/rv64/rvv/strnlen.S   |  81 +++++++++++++
>  sysdeps/riscv/rv64/rvv/strrchr.S   |  88 ++++++++++++++
>  sysdeps/riscv/rv64/rvv/strspn.S    | 189 +++++++++++++++++++++++++++++
>  17 files changed, 1428 insertions(+)
>  create mode 100644 sysdeps/riscv/rv64/rvv/Implies
>  create mode 100644 sysdeps/riscv/rv64/rvv/memchr.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/memcmp.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/memcpy.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/memmove.c
>  create mode 100644 sysdeps/riscv/rv64/rvv/memset.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strchr.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strchrnul.c
>  create mode 100644 sysdeps/riscv/rv64/rvv/strcmp.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strcpy.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strcspn.c
>  create mode 100644 sysdeps/riscv/rv64/rvv/strlen.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strncmp.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strncpy.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strnlen.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strrchr.S
>  create mode 100644 sysdeps/riscv/rv64/rvv/strspn.S
>
> diff --git a/sysdeps/riscv/rv64/rvv/Implies b/sysdeps/riscv/rv64/rvv/Implies
> new file mode 100644
> index 0000000000..b07b4cb906
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/Implies
> @@ -0,0 +1,2 @@
> +riscv/rv64/rvd
> +
> diff --git a/sysdeps/riscv/rv64/rvv/memchr.S b/sysdeps/riscv/rv64/rvv/memchr.S
> new file mode 100644
> index 0000000000..a7e32b8f25
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memchr.S
> @@ -0,0 +1,127 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +
> +/* Optimised memchr for riscv with vector extension
> + * Assumptions:
> + *    - cpu becomes bandwidth limited at or before
> + *            2 vector register sized read/write operations
> + *          + 2 scalar operations
> + *          + conditional branch
> + */
> +
> +.globl  memchr
> +.type   memchr,@function
> +
> +.align    2
> +memchr:
> +    beqz a2, .Lnot_found
> +    csrr    t1, vlenb
> +    bgeu    a2, t1, .Lvector_path   /* only use vector path if we're scanning
> +                                       at least vlenb bytes */
> +
> +#ifndef __riscv_strict_align

strict-align is not the right thing to check here.  As the RVA profile
document explains, all RVA-compliant implementations must support
misaligned loads and stores (so strict-align will be false), but they
might execute extremely slowly (e.g., via trap and emulate), and so
this approach will unduly penalize some implementations.

This is another justification for simply sticking with the generic routines.


> +    li a3, 8
> +    blt a2, a3, .Lbytewise
> +
> +    li      t1, 0x0101010101010101
> +    slli    a4, t1, 7     /* a4 = 0x8080808080808080 */
> +    mul     t2, a1, t1    /* entirety of t2 is now repeats of target character;
> +                             assume mul is at worst no worse than 3*(shift+OR),
> +                             otherwise do that instead */
> +
> +/*
> + * strategy:
> + * t4 = ((*a0) ^ t2)
> + *      - now t4 contains zero bytes if and only if next word of memory
> + *        had target character at those positions
> + *
> + * t4 = ((t4-0x0101010101010101) & ~t4) & 0x8080808080808080
> + *      - all nonzero bytes of t4 become 0; zero bytes become 0x80
> + *
> + * if t4 is nonzero, find the index of the byte within it, add to a0 and return
> + * otherwise, loop
> + */
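
For reference, the same zero-byte trick written out in C (the helper name is
made up; the constants are the ones the assembly keeps in t1 and a4):

    #include <stdint.h>

    /* nonzero iff some byte of WORD equals C: the XOR turns matching bytes
       into zero bytes, and (x - 0x01..) & ~x & 0x80.. is nonzero exactly when
       x contains a zero byte (the lowest set 0x80 marks the first match) */
    static int
    word_has_byte (uint64_t word, unsigned char c)
    {
      uint64_t ones  = 0x0101010101010101ULL;
      uint64_t highs = 0x8080808080808080ULL;
      uint64_t x = word ^ (ones * c);
      return ((x - ones) & ~x & highs) != 0;
    }
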
> +
> +1:
> +    ld t4, (a0)          /* t4 = load next 8 bytes */
> +    xor t4, t4, t2
> +    sub t5, t4, t1
> +    not t4, t4
> +    and t4, t5, t4
> +    and t4, t4, a4
> +    bnez t4, .Lbytewise  /* could use ctzw, mod+lookup or just binary chop
> +                            to locate byte of interest in t4 but profiling
> +                            shows these approaches are at best no better */
> +    addi a2, a2, -8
> +    addi a0, a0, 8
> +    bgeu a2, a3, 1b
> +    beqz a2, .Lnot_found
> +#endif // __riscv_strict_align
> +
> +/* too little data for a dword. mask calculation and branch mispredict costs
> +   make checking a word not worthwhile. degrade to bytewise search. */
> +
> +.Lbytewise:
> +    add t2, a0, a2
> +
> +1:
> +    lb t1, (a0)
> +    beq t1, a1, .Lfound
> +    addi a0, a0, 1
> +    blt a0, t2, 1b
> +
> +.Lnot_found:
> +    mv a0, zero
> +.Lfound:
> +    ret
> +
> +.Lvector_path:
> +    vsetvli t2, a2, e8, m2, ta, ma
> +
> +1:
> +    vle8.v      v2, (a0)
> +    vmseq.vx    v0, v2, a1
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lvec_found
> +    add         a0, a0, t2
> +    sub         a2, a2, t2
> +    bge         a2, t2, 1b
> +    bnez        a2, 2f
> +    mv          a0, zero
> +    ret
> +
> +2:
> +    vsetvli t2, a2, e8, m2, ta, ma
> +    vle8.v      v2, (a0)
> +    vmseq.vx    v0, v2, a1
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lvec_found
> +    mv          a0, zero
> +    ret
> +
> +.Lvec_found:
> +    add a0, a0, t3
> +    ret
> +
> +.size   memchr, .-memchr
> +libc_hidden_builtin_def (memchr)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/memcmp.S b/sysdeps/riscv/rv64/rvv/memcmp.S
> new file mode 100644
> index 0000000000..a945753a5f
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memcmp.S
> @@ -0,0 +1,93 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +/* Optimised memcmp for riscv with vector extension
> + */
> +
> +.globl  memcmp
> +.type   memcmp,@function
> +
> +.align    2
> +
> +memcmp:
> +    mv t2, zero
> +    beqz a2, .Ldone
> +
> +    li          t1, 5            /* scalar path cheaper for 1-4 elts */
> +    bltu        a2, t1, .Lscalar
> +
> +    /* main loop, vlenb*2 elts at a time */
> +    vsetvli     t1, a2, e8, m2, ta, ma
> +
> +1:
> +    vle8.v      v2, (a0)        /* load elts */
> +    vle8.v      v4, (a1)
> +    vmsne.vv    v0, v2, v4      /* compare */
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lvec_diff  /* found a difference ? */
> +    add         a0, a0, t1      /* not yet, advance everything */
> +    add         a1, a1, t1
> +    sub         a2, a2, t1
> +    bgeu        a2, t1, 1b
> +
> +    bnez        a2, .Ltail
> +    mv          a0, zero
> +    ret
> +
> +.Ltail:
> +    /* handle tail. we know a2 < vlenb*2 so just load and compare the lot */
> +    vsetvli     t1, a2, e8, m2, ta, ma
> +    vle8.v      v2, (a0)
> +    vle8.v      v4, (a1)
> +    vmsne.vv    v0, v2, v4
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lvec_diff
> +    mv          a0, zero         /* no diff found */
> +    ret
> +
> +.Lvec_diff:         /* v2, v4 differ at elt t3 */
> +    add a0, a0, t3
> +    add a1, a1, t3
> +    lbu t0, (a0)
> +    lbu t1, (a1)
> +    sub a0, t0, t1
> +    ret
> +
> +.Lscalar:
> +    add  t3, a0, a2
> +
> +1:
> +    lbu t0, (a0)
> +    lbu t1, (a1)
> +    sub t2, t0, t1
> +    bnez t2, .Ldone
> +    addi a0, a0, 1
> +    addi a1, a1, 1
> +    bltu a0, t3, 1b
> +
> +.Ldone:
> +    mv a0, t2
> +    ret
> +
> +
> +.size memcmp, .-memcmp
> +libc_hidden_builtin_def (memcmp)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/memcpy.S b/sysdeps/riscv/rv64/rvv/memcpy.S
> new file mode 100644
> index 0000000000..7b37ec285d
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memcpy.S
> @@ -0,0 +1,154 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +/* Optimised memcpy and memmove for riscv with vector extension
> + */
> +
> +.globl  memcpy
> +.type   memcpy,@function
> +.globl  memmove
> +.type   memmove,@function
> +
> +.align    2
> +memmove:
> +    bge     a0, a1, .Lmemcpy_rev
> +
> +memcpy:
> +.Lmemcpy_fwd:
> +    mv      t0, a0          /* t0 = preserve a0 so we can return it */
> +    csrr    t2, vlenb       /* t2 = number of bytes per vectorised copy op */
> +    slli    t5, t2,  1      /* t5 = number of bytes per loop */
> +    addi    t3, t5, -1      /* generate mask */
> +    not     t4, t3
> +    and     t4, a2, t4      /* t4 = bytes copied in vectorised pass */
> +
> +    beqz    t4, .Lscalar_fwd    /* size too small for even one pass? */
> +
> +    and    a2, a2, t3           /* a2 = bytes still left to copy after pass */
> +    add    t4, t4, a1           /* t4 = src at end of vectorised pass */
> +
> +1:
> +    vl2r.v  v2, (a1)            /* load, advance source */
> +    add     a1, a1, t5
> +    vs2r.v  v2, (t0)            /* store, advance dest */
> +    add     t0, t0, t5
> +    bltu    a1, t4, 1b          /* src at end? */
> +
> +    bltu    a2, t2, .Lscalar_fwd /* should we do one more vec load/store? */
> +    vl1r.v  v2, (a1)
> +    sub     a2, a2, t2
> +    add     a1, a1, t2
> +    vs1r.v  v2, (t0)
> +    add     t0, t0, t2
> +
> +.Lscalar_fwd:
> +    bnez    a2, .Lnobail
> +.Lbail:
> +    ret
> +.Lnobail:
> +
> +#ifndef __riscv_strict_align
> +    addi    t2, zero, 4
> +    bltu    a2, t2, .Lsingle_bytes
> +1:
> +    lw      t3, 0(a1)
> +    addi    a1, a1, 4
> +    sw      t3, 0(t0)
> +    addi    t0, t0, 4
> +    addi    a2, a2, -4
> +    bgeu    a2, t2, 1b
> +#endif // __riscv_strict_align
> +
> +.Lsingle_bytes:
> +    beqz    a2, .Lbail
> +    add     a2, a2, a1          /* a2 = src + remaining size */
> +1:
> +    lb      t1, 0(a1)
> +    sb      t1, 0(t0)
> +    addi    a1, a1, 1
> +    addi    t0, t0, 1
> +    bltu    a1, a2, 1b
> +    ret
> +.size   memcpy,     .-memcpy
> +
> +
> +.Lmemcpy_rev:
> +    beq     a0, a1, .Lmemcpy_rev_done
> +    add     t0, a0, a2          /* t0 = dest so we can return a0=dest later */
> +    add     t6, a1, a2          /* dest and src both point to byte */
> +                                /* immediately after end of buffer */
> +
> +    csrr    t2, vlenb           /* t2 = number of bytes per pass */
> +    slli    t5, t2,  1          /* t5 = number of bytes per entire loop */
> +    addi    t3, t5, -1          /* t3 = (bytes per loop) mask */
> +    not     t4, t3              /* generate mask for bytes processed by loop */
> +    and     t4, a2, t4          /* t4 = bytes copied in vectorised pass */
> +
> +    beqz    t4, .Lscalar_rev    /* size too small for even one pass? */
> +
> +    and    a2, a2, t3           /* a2 = bytes still left to copy after pass */
> +    sub    t4, t6, t4           /* t4 = src at end of vectorised pass */
> +
> +1:
> +    sub     t6, t6, t5
> +    sub     t0, t0, t5
> +    vl2r.v  v2, (t6)            /* load, advance source */
> +    vs2r.v  v2, (t0)            /* store, advance dest */
> +    bgtu    t6, t4, 1b          /* src at end? */
> +
> +    bltu    a2, t2, .Lscalar_rev /* should we do one more vec load/store? */
> +    sub     t6, t6, t2
> +    sub     t0, t0, t2
> +    sub     a2, a2, t2
> +    vl1r.v  v2, (t6)
> +    vs1r.v  v2, (t0)
> +
> +.Lscalar_rev:
> +#ifndef __riscv_strict_align
> +    beqz    a2, .Lbail
> +
> +    addi    t2, zero, 4
> +    bltu    a2, t2, 2f
> +1:
> +    addi    t6, t6, -4
> +    addi    t0, t0, -4
> +    addi    a2, a2, -4
> +    lw      t3, 0(t6)
> +    sw      t3, 0(t0)
> +    bgeu    a2, t2, 1b
> +2:
> +#endif // __riscv_strict_align
> +
> +    beqz    a2, .Lbail
> +1:
> +    addi    t6, t6, -1
> +    addi    t0, t0, -1
> +    lb      t1, 0(t6)
> +    sb      t1, 0(t0)
> +    bgtu    t0, a0, 1b
> +
> +.Lmemcpy_rev_done:
> +    ret
> +
> +.size   memmove, .-memmove
> +libc_hidden_builtin_def (memcpy)
> +libc_hidden_builtin_def (memmove)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/memmove.c b/sysdeps/riscv/rv64/rvv/memmove.c
> new file mode 100644
> index 0000000000..47734854f9
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memmove.c
> @@ -0,0 +1,22 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +/* memmove is implemented in memcpy.S
> + */
> diff --git a/sysdeps/riscv/rv64/rvv/memset.S b/sysdeps/riscv/rv64/rvv/memset.S
> new file mode 100644
> index 0000000000..6f82c542b1
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/memset.S
> @@ -0,0 +1,89 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +
> +/* Optimised memset for riscv with vector extension
> + */
> +
> +.globl  memset
> +.type   memset,@function
> +
> +.align    2
> +memset:
> +    mv      t0, a0                  /* t0 = dest so we can return a0 later */
> +    vsetvli t2, a2, e8, m2, ta, ma  /* t2 = elts per copy */
> +    beqz    t2, .Lscalar
> +
> +    vmv.v.x v2, a1                  /* splat value across v2 */
> +
> +    slli    t3, t2, 1
> +    bgtu    t3, a2, .Lsinglestore
> +
> +1:
> +    vse8.v  v2, (t0)
> +    add     t0, t0, t2
> +    vse8.v  v2, (t0)
> +    add     t0, t0, t2
> +    sub     a2, a2, t3
> +    bgeu    a2, t3, 1b
> +    bgeu    a2, t2, .Lsinglestore
> +    bnez    a2, .Lscalar
> +
> +.Lbail:
> +    ret
> +
> +.Lsinglestore:
> +    bgtu    t2, a2, .Lscalar
> +    vse8.v  v2, (t0)
> +    add     t0, t0, t2
> +    sub     a2, a2, t2
> +
> +.Lscalar:
> +    beqz    a2, .Lbail
> +
> +#ifndef __riscv_strict_align
> +    slli    t2, a1, 8
> +    or      a1, a1, t2
> +    slli    t2, a1, 16
> +    or      a1, a1, t2
> +
> +    addi    t2, zero, 4
> +    bltu    a2, t2, 2f
> +
> +1:
> +    sw      a1, 0(t0)
> +    addi    t0, t0, 4
> +    addi    a2, a2, -4
> +    bgeu    a2, t2, 1b
> +2:
> +    beqz    a2, .Lbail
> +#endif // __riscv_strict_align
> +
> +    add     a2, a2, t0
> +1:
> +    sb      a1, 0(t0)
> +    addi    t0, t0, 1
> +    bltu    t0, a2, 1b
> +    ret
> +
> +.size   memset,     .-memset
> +libc_hidden_builtin_def (memset)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strchr.S b/sysdeps/riscv/rv64/rvv/strchr.S
> new file mode 100644
> index 0000000000..0b37174c55
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strchr.S
> @@ -0,0 +1,92 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strchr
> +.type   strchr,@function
> +
> +.globl  __strchrnul
> +.type   __strchrnul,@function
> +
> +/*
> + *  optimized strchr for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 2*vlenb
> + */
> +
> +.align    2
> +__strchrnul:
> +    li t5, -1
> +    j  1f
> +
> +strchr:
> +    mv          t5, zero
> +1:  csrr        t1, vlenb              /* find vlenb*2 */
> +    add         t1, t1, t1
> +    addi        t2, t1, -1             /* mask off unaligned part of pointer */
> +    and         t2, a0, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2             /* search however many bytes
> +                                          are needed to align the pointer */
> +    vsetvli     t2, t2, e8, m2, ta, mu
> +
> +    vle8.v      v2, (a0)               /* load data into v2(,v3) */
> +    vmseq.vx    v4, v2, zero
> +    vfirst.m    t4, v4
> +    vmsbf.m     v0, v4
> +    vmseq.vx    v0, v2, a1, v0.t
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lfound
> +    bgez        t4, .Lbufferend
> +    add         a0, a0, t2
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, mu
> +    li          t4, -1
> +
> +1:
> +    vle8.v      v2, (a0)
> +    vmseq.vx    v4, v2, zero
> +    vfirst.m    t4, v4
> +    vmsbf.m     v0, v4
> +    vmseq.vx    v0, v2, a1, v0.t
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lfound
> +    bgez        t4, .Lbufferend
> +    add         a0, a0, t1
> +    j           1b
> +
> +.Lfound:                                    /* found the target at a0+t3 */
> +    add         a0, a0, t3
> +    ret
> +
> +.Lbufferend:
> +    add         a0, a0, t4
> +    and         a0, a0, t5
> +    ret
> +
> +.size   strchr, .-strchr
> +.size   __strchrnul, .-__strchrnul
> +
> +libc_hidden_builtin_def (strchr)
> +weak_alias (__strchrnul, strchrnul)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strchrnul.c b/sysdeps/riscv/rv64/rvv/strchrnul.c
> new file mode 100644
> index 0000000000..259da80358
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strchrnul.c
> @@ -0,0 +1,22 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +/* strchrnul is implemented in strchr.S
> + */
> diff --git a/sysdeps/riscv/rv64/rvv/strcmp.S b/sysdeps/riscv/rv64/rvv/strcmp.S
> new file mode 100644
> index 0000000000..4a219221ac
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcmp.S
> @@ -0,0 +1,108 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strcmp
> +.type   strcmp,@function
> +
> +.align    2
> +
> +/* most of the time, one or both sides is unaligned and their alignments differ
> + * we need to check for a null terminator before crossing a page boundary
> + * strategy:
> + * - for each side, calculate masks for alignment and (vlenb * 2) - alignment
> + * - while no difference encountered:
> + * -   for each side:
> + * -       load bytes to end of next vlenb*2 block
> + * -       check for null terminator
> + * -       if no terminator, load bytes to fill rest of register
> + * -   compare sides
> + */
> +
> +strcmp:
> +    csrr        t1, vlenb                   /* find vlenb*2 */
> +    add         t1, t1, t1
> +    vsetvli     zero, t1, e8, m2, ta, mu
> +    vid.v       v30
> +    addi        t2, t1, -1         /* mask for unaligned part of ptr */
> +    and         t6, a0, t2         /* unaligned part of lhs */
> +    and         t5, a1, t2         /* unaligned part of rhs */
> +    sub         t6, t1, t6         /* safe number of lhs bytes to read */
> +    sub         t5, t1, t5         /* same, rhs */
> +    vmsltu.vx   v28, v30, t6       /* v28 = mask for first half of lhs load */
> +    vmsltu.vx   v26, v30, t5       /* v26 = mask for first half of rhs load */
> +    vmv.v.x     v16, zero
> +    vmv.v.x     v18, zero
> +
> +1:  vmv.v.v     v0, v28              /* lhs mask */
> +    vle8.v      v2, (a0), v0.t       /* masked load from lhs */
> +    vmseq.vx    v16, v2, zero, v0.t  /* check loaded bytes for null */
> +    vmv.v.v     v0, v26              /*       rhs mask */
> +    vfirst.m    t2, v16              /* get lhs check result */
> +    bgez        t2, .Ltail           /* bail if we can't safely check rest */
> +    vle8.v      v4, (a1), v0.t       /*    masked load from rhs */
> +    vmseq.vx    v18, v4, zero, v0.t  /*    check partial rhs for null */
> +    vmnot.m     v0, v28              /* mask for rest of lhs */
> +    vfirst.m    t3, v18              /* get check result */
> +    bltz        t3, 2f               /* test it */
> +                                     /* we see null terminator */
> +    bge         t3, t6, .Ltail       /* have enough bytes for vector cmp? */
> +
> +    vmsleu.vx   v0, v30, t3          /* select rest + null */
> +    vmsne.vv    v0, v2, v4, v0.t     /* compare */
> +    vfirst.m    t3, v0
> +    bgez        t3, 3f
> +    mv          a0, zero             /* no difference */
> +    ret
> +3:  add a0, a0, t3
> +    add a1, a1, t3
> +    lbu t0, (a0)
> +    lbu t1, (a1)
> +.Ldiff:
> +    sub a0, t0, t1
> +    ret
> +
> +    /* ...no null terminator */
> +2:  vle8.v      v2, (a0), v0.t       /* load rest of lhs */
> +    vmnot.m     v0, v26              /* mask for rest of rhs */
> +    vle8.v      v4, (a1), v0.t       /* load rest of rhs */
> +    vmsne.vv    v0, v2, v4           /* compare */
> +    add         a0, a0, t1           /* advance ptrs */
> +    vfirst.m    t3, v0
> +    add         a1, a1, t1
> +    bltz        t3, 1b
> +
> +    sub t3, t3, t1 /* found difference but we've already advanced a0 and a1 */
> +    j 3b
> +
> +.Ltail:
> +    lbu t0, (a0)
> +    lbu t1, (a1)
> +    bne t0, t1, .Ldiff
> +    addi a0, a0, 1
> +    addi a1, a1, 1
> +    bnez t0, .Ltail
> +    mv a0, zero
> +    ret
> +
> +
> +.size strcmp, .-strcmp
> +libc_hidden_builtin_def (strcmp)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strcpy.S b/sysdeps/riscv/rv64/rvv/strcpy.S
> new file mode 100644
> index 0000000000..b21909d66f
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcpy.S
> @@ -0,0 +1,72 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strcpy
> +.type   strcpy,@function
> +
> +/*
> + *  optimized strcpy for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 2*vlenb
> + */
> +
> +.align    2
> +strcpy:
> +    mv          t0, a0                     /* copy dest so we can return it */
> +
> +    csrr        t1, vlenb                  /* find vlenb*2 */
> +    add         t1, t1, t1
> +
> +    addi        t2, t1, -1                 /* mask unaligned part of ptr */
> +    and         t2, a1, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2                 /* search enough to align ptr */
> +    vsetvli     t2, t2, e8, m2, tu, mu
> +    vle8.v      v2, (a1)
> +    vmseq.vx    v4, v2, zero
> +    vmsif.m     v0, v4                     /* copy but not past null */
> +    vfirst.m    t3, v4
> +    vse8.v      v2, (t0), v0.t
> +    bgez        t3, .Ldone
> +    add         t0, t0, t2
> +    add         a1, a1, t2
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, mu   /* now do 2*vlenb bytes per pass */
> +
> +1:
> +    vle8.v      v2, (a1)
> +    add         a1, a1, t1
> +    vmseq.vx    v4, v2, zero
> +    vmsif.m     v0, v4
> +    vfirst.m    t3, v4
> +    vse8.v      v2, (t0), v0.t
> +    add         t0, t0, t1
> +    bltz        t3, 1b
> +
> +.Ldone:
> +    ret
> +
> +.size   strcpy, .-strcpy
> +libc_hidden_builtin_def (strcpy)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strcspn.c b/sysdeps/riscv/rv64/rvv/strcspn.c
> new file mode 100644
> index 0000000000..f0595a72fb
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strcspn.c
> @@ -0,0 +1,22 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +/* strcspn is implemented in strspn.S
> + */
> diff --git a/sysdeps/riscv/rv64/rvv/strlen.S b/sysdeps/riscv/rv64/rvv/strlen.S
> new file mode 100644
> index 0000000000..c77d500693
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strlen.S
> @@ -0,0 +1,67 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strlen
> +.type   strlen,@function
> +
> +/*
> + *  optimized strlen for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 2*vlenb
> + */
> +
> +.align    2
> +strlen:
> +    mv          t4, a0                    /* copy of buffer start */
> +    csrr        t1, vlenb                 /* find vlenb*2 */
> +    add         t1, t1, t1
> +    addi        t2, t1, -1                /* mask off unaligned part of ptr */
> +    and         t2, a0, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2                /* search fwd to align ptr */
> +    vsetvli     t2, t2, e8, m2, ta, ma
> +    vle8.v      v2, (a0)
> +    vmseq.vx    v0, v2, zero
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lfound
> +    add         a0, a0, t2
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, ma  /* search 2*vlenb bytes per pass */
> +    add         t4, t4, t1
> +
> +1:
> +    vle8.v      v2, (a0)
> +    add         a0, a0, t1
> +    vmseq.vx    v0, v2, zero
> +    vfirst.m    t3, v0
> +    bltz        t3, 1b
> +
> +.Lfound:                                  /* found the 0; subtract          */
> +    sub         a0, a0, t4                /* buffer start from current ptr  */
> +    add         a0, a0, t3                /* and add offset into fetched    */
> +    ret                                   /* data to get length */
> +
> +.size   strlen, .-strlen
> +libc_hidden_builtin_def (strlen)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strncmp.S b/sysdeps/riscv/rv64/rvv/strncmp.S
> new file mode 100644
> index 0000000000..863e5cb525
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strncmp.S
> @@ -0,0 +1,104 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strncmp
> +.type   strncmp,@function
> +
> +.align    2
> +
> +/* as strcmp, but with added checks on a2 (max count)
> + */
> +
> +strncmp:
> +    csrr        t1, vlenb                   /* find vlenb*2 */
> +    add         t1, t1, t1
> +    blt         a2, t1, .Ltail              /* degrade if max < vlenb*2 */
> +    vsetvli     zero, t1, e8, m2, ta, mu
> +    vid.v       v30
> +    addi        t2, t1, -1                  /* mask unaligned part of ptr */
> +    and         t6, a0, t2                  /* unaligned part of lhs */
> +    and         t5, a1, t2                  /* unaligned part of rhs */
> +    sub         t6, t1, t6                  /* safe count to read from lhs */
> +    sub         t5, t1, t5                  /* same, rhs */
> +    vmsltu.vx   v28, v30, t6                /* mask for first part of lhs */
> +    vmsltu.vx   v26, v30, t5                /* mask for first part of rhs */
> +    vmv.v.x     v16, zero
> +    vmv.v.x     v18, zero
> +
> +
> +1:  blt         a2, t1, .Ltail
> +    vmv.v.v     v0, v28                     /* lhs mask */
> +    vle8.v      v2, (a0), v0.t              /* masked load from lhs */
> +    vmseq.vx    v16, v2, zero, v0.t         /* check loaded bytes for null */
> +    vmv.v.v     v0, v26                     /*       rhs mask */
> +    vfirst.m    t2, v16                     /* get lhs check result */
> +    bgez        t2, .Ltail                  /* can we safely check rest */
> +    vle8.v      v4, (a1), v0.t              /*       masked load from rhs */
> +    vmseq.vx    v18, v4, zero, v0.t         /*       check partial rhs */
> +    vmnot.m     v0, v28                     /* mask for rest of lhs */
> +    vfirst.m    t3, v18                     /* get check result */
> +    bltz        t3, 2f                      /* test it */
> +    bge         t3, t6, .Ltail
> +
> +    vmsleu.vx   v0, v30, t3                 /* select rest of string + null */
> +    vmsne.vv    v0, v2, v4, v0.t            /* compare */
> +    vfirst.m    t3, v0
> +    bgez        t3, 3f
> +    mv          a0, zero
> +    ret
> +3:  add a0, a0, t3
> +    add a1, a1, t3
> +    lbu t0, (a0)
> +    lbu t1, (a1)
> +.Ldiff:
> +    sub a0, t0, t1
> +    ret
> +
> +    /* ...no null terminator in first part of lhs or rhs */
> +2:  vle8.v      v2, (a0), v0.t              /* load rest of lhs */
> +    vmnot.m     v0, v26                     /* mask for rest of rhs */
> +    vle8.v      v4, (a1), v0.t              /* load rest of rhs */
> +    vmsne.vv    v0, v2, v4                  /* compare */
> +    add         a0, a0, t1                  /* advance ptrs */
> +    vfirst.m    t3, v0
> +    add         a1, a1, t1
> +    sub         a2, a2, t1
> +    bltz        t3, 1b
> +
> +    sub t3, t3, t1  /* found a diff but we've already advanced a0 and a1 */
> +    j 3b
> +
> +.Ltail:
> +    beqz a2, 1f
> +    addi a2, a2, -1
> +    lbu t0, (a0)
> +    lbu t1, (a1)
> +    bne t0, t1, .Ldiff
> +    addi a0, a0, 1
> +    addi a1, a1, 1
> +    bnez t0, .Ltail
> +1:  mv a0, zero
> +    ret
> +
> +
> +.size strncmp, .-strncmp
> +libc_hidden_builtin_def (strncmp)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strncpy.S b/sysdeps/riscv/rv64/rvv/strncpy.S
> new file mode 100644
> index 0000000000..8b3a1e545c
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strncpy.S
> @@ -0,0 +1,96 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strncpy
> +.type   strncpy,@function
> +
> +/*
> + *  optimized strncpy for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 2*vlenb
> + */
> +
> +.align    2
> +strncpy:
> +    mv          t0, a0                    /* need to return dest so copy */
> +
> +    csrr        t1, vlenb                 /* find vlenb*2 */
> +    add         t1, t1, t1
> +
> +    addi        t2, t1, -1                /* mask off unaligned part of ptr */
> +    and         t2, a1, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2                /* search to align the pointer */
> +    vsetvli     zero, t2, e8, m2, tu, mu
> +    vle8.v      v2, (a1)
> +    vmseq.vx    v4, v2, zero
> +    vmsif.m     v0, v4                    /* copy to dest */
> +    vfirst.m    t3, v4
> +    bgeu        t2, a2, .Ldest_full
> +    vse8.v      v2, (t0), v0.t
> +    bgez        t3, .Lterminator_found
> +    add         t0, t0, t2
> +    add         a1, a1, t2
> +    sub         a2, a2, t2
> +    beqz        a2, .Ldone
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> +
> +1:
> +    vle8.v      v2, (a1)
> +    add         a1, a1, t1
> +    vmseq.vx    v4, v2, zero
> +    vmsif.m     v0, v4
> +    vfirst.m    t3, v4
> +    bgeu        t1, a2, .Ldest_full
> +    vse8.v      v2, (t0), v0.t
> +    add         t0, t0, t1
> +    sub         a2, a2, t1
> +    bltz        t3, 1b
> +    sub         t0, t0, t1
> +
> +.Lterminator_found:
> +    addi        sp, sp, -16
> +    sd          ra, 0(sp)
> +    sd          a0, 8(sp)
> +    add         a0, t0, t3
> +    mv          a1, zero
> +    sub         a2, a2, t3
> +    jal         ra, memset
> +    ld          ra, 0(sp)
> +    ld          a0, 8(sp)
> +    addi        sp, sp, 16
> +.Ldone:
> +    ret
> +
> +.Ldest_full:
> +    vid.v       v6
> +    vmsltu.vx   v4, v6, a2
> +    vmand.mm    v0, v0, v4
> +    vse8.v      v2, (t0), v0.t
> +    ret
> +
> +.size   strncpy, .-strncpy
> +libc_hidden_builtin_def (strncpy)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strnlen.S b/sysdeps/riscv/rv64/rvv/strnlen.S
> new file mode 100644
> index 0000000000..6d7ee65c7a
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strnlen.S
> @@ -0,0 +1,81 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  __strnlen
> +.type   __strnlen,@function
> +
> +/* vector optimized strnlen
> + * assume it's safe to read to the end of the page
> + * containing either a null terminator or the last byte of the count or both,
> + * but not past it
> + * assume page size >= vlenb*2
> + */
> +
> +.align    2
> +__strnlen:
> +    mv          t4, a0               /* stash a copy of start for later */
> +    beqz        a1, .LzeroCount
> +
> +    csrr        t1, vlenb            /* find vlenb*2 */
> +    add         t1, t1, t1
> +    addi        t2, t1, -1           /* mask off unaligned part of ptr */
> +    and         t2, a0, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2           /* search to align pointer to t1 */
> +    bgeu        t2, a1, 2f           /* check it's safe */
> +    mv          t2, a1               /* it's not! look as far as permitted */
> +2:  vsetvli     t2, t2, e8, m2, ta, ma
> +    vle8.v      v2, (a0)
> +    vmseq.vx    v0, v2, zero
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lfound
> +    add         a0, a0, t2
> +    sub         a1, a1, t2
> +    bltu        a1, t1, .LreachedCount
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, ma    /* do 2*vlenb bytes per pass */
> +
> +1:  vle8.v      v2, (a0)
> +    sub         a1, a1, t1
> +    vmseq.vx    v0, v2, zero
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lfound
> +    add         a0, a0, t1
> +    bgeu        a1, t1, 1b
> +.LreachedCount:
> +    mv          t2, a1    /* in case 0 < a1 < t1 */
> +    bnez        a1, 2b    /* if so, still t2 bytes to check, all safe */
> +.LzeroCount:
> +    sub         a0, a0, t4
> +    ret
> +
> +.Lfound:        /* found the 0; subtract buffer start from current pointer */
> +    add         a0, a0, t3 /* and add offset into fetched data */
> +    sub         a0, a0, t4
> +    ret
> +
> +.size   __strnlen, .-__strnlen
> +weak_alias (__strnlen, strnlen)
> +libc_hidden_builtin_def (__strnlen)
> +libc_hidden_builtin_def (strnlen)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strrchr.S b/sysdeps/riscv/rv64/rvv/strrchr.S
> new file mode 100644
> index 0000000000..4bef8a3b9c
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strrchr.S
> @@ -0,0 +1,88 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strrchr
> +.type   strrchr,@function
> +
> +/*
> + *  optimized strrchr for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 2*vlenb
> + */
> +
> +.align    2
> +strrchr:
> +    mv          t5, a0    /* stash buffer ptr somewhere safe */
> +    mv          a0, zero  /* result is nullptr unless we find better below */
> +
> +    csrr        t1, vlenb                /* determine vlenb*2 */
> +    add         t1, t1, t1
> +    addi        t2, t1, -1               /* mask off unaligned part of ptr */
> +    and         t2, t5, t2
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2               /* search to align ptr to 2*vlenb */
> +    vsetvli     t2, t2, e8, m2, ta, mu
> +
> +    vle8.v      v2, (t5)                 /* load data into v2(,v3) */
> +    vmseq.vx    v4, v2, zero             /* check for null terminator */
> +    vfirst.m    t4, v4                   /* grab its position, if any */
> +    vmsbf.m     v0, v4                   /* select valid chars */
> +    vmseq.vx    v0, v2, a1, v0.t         /* search for candidate byte */
> +    vfirst.m    t3, v0                   /* grab its position, if any */
> +    bltz        t3, 2f                   /* did we find a candidate? */
> +
> +3:  add         a0, t3, t5               /* we did! grab the address */
> +    vmsof.m     v1, v0                   /* there might be more than one */
> +    vmandn.mm   v0, v0, v1               /* so clear the one we just found */
> +    vfirst.m    t3, v0                   /* is there another? */
> +    bgez        t3, 3b
> +
> +2:  bgez        t4, .Ldone               /* did we see a null terminator? */
> +    add         t5, t5, t2
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
> +
> +1:  vle8.v      v2, (t5)
> +    vmseq.vx    v4, v2, zero
> +    vfirst.m    t4, v4
> +    vmsbf.m     v0, v4
> +    vmseq.vx    v0, v2, a1, v0.t
> +    vfirst.m    t3, v0
> +    bltz        t3, 2f
> +
> +3:  add         a0, t3, t5
> +    vmsof.m     v1, v0
> +    vmandn.mm   v0, v0, v1
> +    vfirst.m    t3, v0
> +    bgez        t3, 3b
> +
> +2:  add         t5, t5, t1
> +    bltz        t4, 1b
> +
> +.Ldone:
> +    ret
> +
> +.size   strrchr, .-strrchr
> +libc_hidden_builtin_def (strrchr)
> \ No newline at end of file
> diff --git a/sysdeps/riscv/rv64/rvv/strspn.S b/sysdeps/riscv/rv64/rvv/strspn.S
> new file mode 100644
> index 0000000000..2b9af5cc2d
> --- /dev/null
> +++ b/sysdeps/riscv/rv64/rvv/strspn.S
> @@ -0,0 +1,189 @@
> +
> +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <sysdep.h>
> +
> +.globl  strspn
> +.type   strspn,@function
> +
> +.globl  strcspn
> +.type   strcspn,@function
> +
> +/*
> + *  optimized strspn / strcspn for riscv with vector extension
> + *  assumptions:
> + *  - vlenb is a power of 2
> + *  - page size >= 32
> + *  strategy:
> + *  - build a 256-bit table on the stack, where each elt is zero
> + *    if encountering it should terminate computation and nonzero otherwise
> + *  - use vectorised lookups into this to check 2*vlen elts at a time;
> + *    this code is identical for strspn and strcspn and can be shared
> + *
> + *  note that while V mandates at least 128 bit wide regs,
> + *  we are building a 256 bit lookup table
> + *  therefore we use either LMUL=1 or 2 depending on what the target supports
> + *  therefore we only use even vector register numbers,
> + *  so everything still works if we go with LMUL=2
> + */
> +
> +# -----------------------------
> +
> +.align    2
> +
> +strspn:
> +    lbu         t0, 0(a1)
> +    bnez        t0, .Lbuild_table
> +    mv          a0, zero
> +    ret
> +
> +.Lbuild_table:
> +    mv          a6, a0 /* store incoming a0 */
> +    li          t1, 32 /* want to deal with 256 bits at a time, so 32 bytes */
> +
> +    vsetvli     zero, t1, e8, m1, tu, mu
> +#if __riscv_v_min_vlen < 256
> +    /* we want to build a 256-bit table, so use vlenb*2,
> +     * m2 if regs are 128 bits wide or vlenb, m1 if >= 256
> +     * 'V' extension specifies a minimum vlen of 128 so this should cover
> +     * all cases; we can skip the check if we know vlen >= 256 at compile time
> +     */
> +    csrr        t2, vlenb
> +    bgeu        t2, t1, 1f
> +    vsetvli     zero, t1, e8, m2, tu, mu
> +1:
> +#endif // __riscv_v_min_vlen
> +
> +    /* read one char from the charset at a time and write the correct bit
> +     * in the lookup table; we could do SIMD iff we ever get an extension
> +     * that provides some way of scattering bytes into a reg group
> +     */
> +    vmv.v.x     v16, zero       /* clear out table */
> +    vmv.v.x     v8, zero        /* clear out v8 */
> +    li          t3, 1
> +    vmv.s.x     v8, t3          /* v8 now all zeroes except bottom byte */
> +
> +1:  vmv.v.x     v2, zero        /* clear out v2 */
> +    addi        a1, a1, 1       /* advance charset ptr */
> +    srli        t2, t0, 3       /* divide the byte we read earlier by 8 */
> +    vslideup.vx v2, v8, t2      /* v2 now 1 in the correct byte 0 elsewhere */
> +    vsll.vx     v2, v2, t0      /* v2 now 1 in the correct bit, 0 elsewhere */
> +    vor.vv      v16, v16, v2    /* or it in */
> +    lbu         t0, 0(a1)       /* fetch next byte */
> +    bnez        t0, 1b          /* if it's not null, go round again */
> +
> +/*
> + *   Table is now built in v16.
> + *   Strategy:
> + *   - fetch next t1 bytes from memory
> + *   - vrgather on their values divided by 8 to get relevant bytes of table
> + *   - shift right to get the correct bit into bit 0
> + *   - and with 1, compare with expected terminator value, then check mask
> + *     to see if we've found a terminator
> + *
> + *   Before we can begin, a0 needs to be t1-aligned, so that when we fetch
> + *   the next t1 bytes - any of which may be the null terminator -
> + *   we do not cross a page boundary and read unmapped memory. Therefore
> + *   we have one read of however many bytes are needed to align a0,
> + *   before the main loop.
> + */
> +
> +.Lscan_table:
> +    vmv.v.x     v8, t3              /* v8 now t1 bytes of 0x01 */
> +
> +    andi        t2, a0, 31          /* mask to align to t1 (== 32) */
> +    beqz        t2, 2f              /* or skip if we're already aligned */
> +    sub         t2, t1, t2          /* t2 now bytes to read to align to t1 */
> +
> +    vid.v       v2                  /* build mask instead of changing vl */
> +    vmsltu.vx   v0, v2, t2          /* so we don't need to track LMUL */
> +
> +    vle8.v      v2, (a0), v0.t      /* load next bytes from input */
> +    vsrl.vi     v4, v2, 3           /* divide by 8 */
> +    vrgather.vv v6, v16, v4         /* corresponding bytes of bit table */
> +    vsrl.vv     v6, v6, v2          /* shift correct bits to lsb */
> +    vand.vv     v6, v6, v8          /* and with 1 to complete the lookups */
> +    vmseq.vx    v4, v6, zero, v0.t  /* check to see if any 0s are present */
> +    vfirst.m    t0, v4              /* index of the first 0, if any */
> +    bgez        t0, .Lscan_end      /* if we found one, stop */
> +    add         a0, a0, t2          /* advance by number of bytes we read */
> +
> +2:  add         a6, a6, t1     /* we'll advance a0 before the exit check */
> +1:  vle8.v      v2, (a0)       /* as above but unmasked so t1 elts per pass */
> +    add         a0, a0, t1
> +
> +    vsrl.vi     v4, v2, 3
> +    vrgather.vv v6, v16, v4
> +    vsrl.vv     v6, v6, v2
> +    vand.vv     v6, v6, v8
> +
> +    vmseq.vx    v4, v6, zero
> +    vfirst.m    t0, v4
> +    bltz        t0, 1b
> +
> +.Lscan_end:
> +    add         a0, a0, t0     /* calculate offset to terminating byte */
> +    sub         a0, a0, a6
> +    ret
> +.size   strspn, .-strspn
> +
> +/* strcspn
> + *
> + * table build exactly as for strspn, except:
> + * - the lookup table starts with all bits except bit 0 of byte 0 set
> + * - we clear the corresponding bit for each byte in the charset
> + * once table is built, we can reuse the scan code directly
> + */
> +
> +strcspn:
> +    lbu         t0, 0(a1)
> +    beqz        t0, strlen   /* no rejections -> prefix is whole string */
> +
> +    mv          a6, a0
> +    li          t1, 32
> +
> +    vsetvli     zero, t1, e8, m1, tu, mu
> +#if __riscv_v_min_vlen < 256
> +    csrr        t2, vlenb
> +    bgeu        t2, t1, 1f
> +    vsetvli     zero, t1, e8, m2, tu, mu
> +1:
> +#endif // __riscv_v_min_vlen
> +
> +    vmv.v.x     v8, zero
> +    li          t3, 1           /* all bits clear except bit 0 of byte 0 */
> +    vmv.s.x     v8, t3
> +    vnot.v      v16, v8         /* v16 is the inverse of that */
> +    li          t4, -1
> +
> +1:  vmv.v.x     v2, zero
> +    addi        a1, a1, 1       /* advance charset ptr */
> +    srli        t2, t0, 3       /* select correct bit in v2 */
> +    vslideup.vx v2, v8, t2
> +    vsll.vx     v2, v2, t0
> +    vnot.v      v2, v2          /* invert */
> +    vand.vv     v16, v16, v2    /* clear the relevant bit of table */
> +    lbu         t0, 0(a1)
> +    bnez        t0, 1b
> +    j           .Lscan_table
> +.size   strcspn, .-strcspn
> +
> +libc_hidden_builtin_def (strspn)
> +libc_hidden_builtin_def (strcspn)
> \ No newline at end of file
> --
> 2.34.1
>
  
Sergei Lewis Feb. 2, 2023, 9:34 a.m. UTC | #10
Thank you for the feedback!

The idea of this changeset is to provide vectorised implementations for the
specific case where someone opts in by explicitly describing the build
target as supporting the "V" extension via the build environment when
building glibc; where this is not done, existing behaviour is unchanged.

I agree that multiarch/ifuncs absolutely has to be the way going forward.
Most people in the wild will be using prebuilt binaries, and generally
these will be built for the lowest common denominator; we do want to use
optimal code where we can detect that the target supports it even if this
was not known at build time. Moreover, we can reasonably support much more
specific optimisation (e.g. specialise on VLENB, or decide whether high
LMUL or low LMUL plus more unrolling is preferable) via ifuncs than at
compile time.

Even once the compiler issues are sorted out, coming up with a robust
mechanism for probing the relevant properties of the environment for ifuncs
is a significant engineering challenge on RISC-V, and one that I expect to
be addressing going forward. Once that is in place, it will be possible via
ifuncs to make use of the vectorised implementations even in binaries where
the V extension was not explicitly targeted at compile time, if it is
available at runtime.
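
For concreteness, the selection itself is just an ordinary ifunc resolver,
something along these lines; the probe function below is a placeholder for
whatever probing interface we end up with, not an existing API, and glibc's
multiarch machinery would of course wrap this up differently:

    #include <stddef.h>

    extern void *__memcpy_rvv (void *, const void *, size_t);
    extern void *__memcpy_generic (void *, const void *, size_t);

    /* Placeholder: needs the kernel/user probing interface discussed
       in this thread; not an existing function.  */
    extern int __riscv_hwprobe_has_v (void);

    static void *
    memcpy_resolver (void)
    {
      return __riscv_hwprobe_has_v ()
             ? (void *) __memcpy_rvv
             : (void *) __memcpy_generic;
    }

    void *memcpy (void *, const void *, size_t)
         __attribute__ ((ifunc ("memcpy_resolver")));

The hard part is not this boilerplate but making the probe robust.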

Getting the optimised implementations out early for anyone opting in to
them will help anyone else working in this space and anyone looking at
benchmarks on specific targets; it would also get the code in front of more
eyeballs than just mine while I crack on with ifuncs.

Certainly, though, if the ifuncs infrastructure has already been created by
someone else and can be shared, that would be super helpful, and if the
community would prefer I keep these private until that point we can do that.

On Wed, Feb 1, 2023 at 5:07 PM Jeff Law <jeffreyalaw@gmail.com> wrote:

>
>
> On 2/1/23 09:42, Florian Weimer wrote:
> > * Jeff Law via Libc-alpha:
> >
> >> On 2/1/23 02:52, Sergei Lewis wrote:
> >>> Initial implementations of memchr, memcmp, memcpy, memmove, memset,
> strchr,
> >>> strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
> >>> targeting the riscv "V" extension, version 1.0
> >>> The vectorised implementations assume VLENB of at least 128 and at
> >>> least 32
> >>> registers (as mandated by the "V" extension spec). They also assume
> that
> >>> VLENB is a power of two which is no larger than the page size, and (as
> >>> vectorised code in glibc for other platforms does) that it is safe to
> read
> >>> past null terminators / buffer ends provided one does not cross a page
> >>> boundary.
> >>> Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
> >>> ---
> >>>    sysdeps/riscv/rv64/rvv/Implies     |   2 +
> >>>    sysdeps/riscv/rv64/rvv/memchr.S    | 127 +++++++++++++++++++
> >>>    sysdeps/riscv/rv64/rvv/memcmp.S    |  93 ++++++++++++++
> >>>    sysdeps/riscv/rv64/rvv/memcpy.S    | 154 +++++++++++++++++++++++
> >>>    sysdeps/riscv/rv64/rvv/memmove.c   |  22 ++++
> >>>    sysdeps/riscv/rv64/rvv/memset.S    |  89 ++++++++++++++
> >>>    sysdeps/riscv/rv64/rvv/strchr.S    |  92 ++++++++++++++
> >>>    sysdeps/riscv/rv64/rvv/strchrnul.c |  22 ++++
> >>>    sysdeps/riscv/rv64/rvv/strcmp.S    | 108 +++++++++++++++++
> >>>    sysdeps/riscv/rv64/rvv/strcpy.S    |  72 +++++++++++
> >>>    sysdeps/riscv/rv64/rvv/strcspn.c   |  22 ++++
> >>>    sysdeps/riscv/rv64/rvv/strlen.S    |  67 ++++++++++
> >>>    sysdeps/riscv/rv64/rvv/strncmp.S   | 104 ++++++++++++++++
> >>>    sysdeps/riscv/rv64/rvv/strncpy.S   |  96 +++++++++++++++
> >>>    sysdeps/riscv/rv64/rvv/strnlen.S   |  81 +++++++++++++
> >>>    sysdeps/riscv/rv64/rvv/strrchr.S   |  88 ++++++++++++++
> >>>    sysdeps/riscv/rv64/rvv/strspn.S    | 189
> +++++++++++++++++++++++++++++
> >> Does this need to be revamped given the recent push to do more with
> >> generic code and target specific hooks for mem* and str*?
> >>
> >> Shouldn't the implementations be in a multiarch directory?  I would
> >> fully expect we're going to need both a vector and scalar
> >> implementation selected by an ifunc.
> >
> > I think most RISC-V GCC compilers won't have enabled IFUNC support?
> > Looking at gcc/config.gcc in GCC 12, I see this:
> >
> > *-*-linux* | *-*-gnu*)
> >          case ${target} in
> >          aarch64*-* | arm*-* | i[34567]86-* | powerpc*-* | s390*-* |
> sparc*-* | x86_64-* | loongarch*-*)
> >                  default_gnu_indirect_function=yes
> >                  ;;
> >          esac
> >
> > But maybe that's not the right place to look at?
> Clearly something we need to fix.
>
> I'd hesitate to turn on the gcc bits without having the kernel/user
> interface settled.  There was a proposal that added a syscall to get the
> processor capabilities -- I'd asked the authors to reach out to you and
> Carlos on whether or not that was acceptable for glibc.  I'm not sure if
> that happened or not.
>
> >
> > We have an assembler hack to be able to still build IFUNC resolvers
> > written in C, but I don't know if this works on RISC-V.
> It probably doesn't yet.
>
> >
> > Ideally the GCC defaults would change, too, and well before IFUNCs are
> > in common use.
> They're not common, but I suspect that'll change in the next ~6 months.
>
> Jeff
>
  
Sergei Lewis Feb. 2, 2023, 10:02 a.m. UTC | #11
Thank you very much for the detailed review!

> > +#ifndef __riscv_strict_align
> Would this be defined by compiler as predefine macro or is it just a debug
> switch? If the later, I think it would be better to remove it.

The intent is to make use of the GCC feature currently in flight here:
https://gcc.gnu.org/pipermail/gcc-patches/2023-January/610115.html to
detect at compile time that the build environment has been configured to
avoid unaligned accesses.
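
Concretely, the plan is just to guard the pieces that depend on unaligned
loads. As a sketch only (has_zero_byte is illustrative, and the macro name
assumes that GCC patch lands as-is), the zero-byte trick the scalar
word-at-a-time paths rely on would become:

    #ifndef __riscv_strict_align
    /* One 64-bit probe per 8 bytes; compiled out on strict-alignment
       targets, which then take the bytewise path only.  */
    static inline int
    has_zero_byte (unsigned long x)
    {
      return ((x - 0x0101010101010101UL) & ~x & 0x8080808080808080UL) != 0;
    }
    #endif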


> It is really worth to add a strrchr optimization?  The generic implementation
> already calls strchr (which should be optimized).

The performance win is actually quite significant; consider searching for
the first space in a piece of text compared to the last space. Reusing
strchr in a loop, as the generic implementation does, would cause a branch
every few bytes of input and essentially destroy all benefit gained from
vectorisation. Worse, vectorised functions typically have a few
instructions of setup/overhead before the work begins, and these would
likewise be repeated every few bytes of input.
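
For reference, the generic code is roughly the following shape, so every
occurrence of the character costs a fresh strchr call with its own setup:

    #include <string.h>

    /* Roughly the generic strrchr: repeated strchr calls.  */
    char *
    generic_strrchr (const char *s, int c)
    {
      const char *found = NULL, *p;
      c = (unsigned char) c;
      if (c == '\0')
        return strchr (s, '\0');
      while ((p = strchr (s, c)) != NULL)
        {
          found = p;
          s = p + 1;
        }
      return (char *) found;
    }

With a dedicated strrchr we pay the setup once and keep scanning forward in
full vector-width chunks while tracking the last match.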

> I wonder if we could adapt the generic implementation, so riscv only reimplements
> the vectorized search instead of all the boilerplace to generate the table and
> early tests.

The issue is that the table looks different for different implementations,
and possibly even for different cases in the same implementation. For
example, some of the existing implementations use a 256-byte table with one
byte per character rather than the 256-bit bitfield I build here. Going
forward we would potentially want such a path for riscv as well, selecting
between the two based on the size of the character set: common use in
parsing tends to produce very small character sets, but for a large set (or
perhaps always, depending on the architecture) indexed loads/stores will
become faster than the bitfield approach I use here.
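
In plain C the two table shapes look like this (names purely illustrative):

    #include <stdint.h>
    #include <string.h>

    /* 256-byte table: one byte per character value; the scan is then a
       plain (or gather/indexed) load per input byte.  */
    static void
    build_byte_table (unsigned char table[256], const unsigned char *set)
    {
      memset (table, 0, 256);
      for (; *set != '\0'; ++set)
        table[*set] = 1;
    }

    /* 256-bit table (32 bytes), the same layout the assembly builds in
       v16: bit (c & 7) of byte (c >> 3) is set iff c is in the set.  */
    static void
    build_bit_table (uint8_t table[32], const unsigned char *set)
    {
      memset (table, 0, 32);
      for (; *set != '\0'; ++set)
        table[*set >> 3] |= (uint8_t) (1u << (*set & 7));
    }

For strcspn the bit table is simply built inverted, as the assembly does,
and the scan code is then shared unchanged.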

I am integrating all the other feedback, and will also work with your
changes to the generic implementations; it looks like there is quite a bit
of potential to reduce and simplify my changeset once yours goes in.

On Wed, Feb 1, 2023 at 5:38 PM Adhemerval Zanella Netto <
adhemerval.zanella@linaro.org> wrote:

>
>
> On 01/02/23 06:52, Sergei Lewis wrote:
> > Initial implementations of memchr, memcmp, memcpy, memmove, memset,
> strchr,
> > strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
> > targeting the riscv "V" extension, version 1.0
> >
> > The vectorised implementations assume VLENB of at least 128 and at least
> 32
> > registers (as mandated by the "V" extension spec). They also assume that
> > VLENB is a power of two which is no larger than the page size, and (as
> > vectorised code in glibc for other platforms does) that it is safe to
> read
> > past null terminators / buffer ends provided one does not cross a page
> > boundary.
> >
> > Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
>
> Some comments that might be useful since I am working the generic
> implementations
> below.
>
> Also, I think it should be splitted with one implementation per patch,
> unless the
> implementation is tied together (as for strchr/strchrnul for instance).
> Does
> the vectorized routine only work for rv64?
>
> > ---
> >  sysdeps/riscv/rv64/rvv/Implies     |   2 +
> >  sysdeps/riscv/rv64/rvv/memchr.S    | 127 +++++++++++++++++++
> >  sysdeps/riscv/rv64/rvv/memcmp.S    |  93 ++++++++++++++
> >  sysdeps/riscv/rv64/rvv/memcpy.S    | 154 +++++++++++++++++++++++
> >  sysdeps/riscv/rv64/rvv/memmove.c   |  22 ++++
> >  sysdeps/riscv/rv64/rvv/memset.S    |  89 ++++++++++++++
> >  sysdeps/riscv/rv64/rvv/strchr.S    |  92 ++++++++++++++
> >  sysdeps/riscv/rv64/rvv/strchrnul.c |  22 ++++
> >  sysdeps/riscv/rv64/rvv/strcmp.S    | 108 +++++++++++++++++
> >  sysdeps/riscv/rv64/rvv/strcpy.S    |  72 +++++++++++
> >  sysdeps/riscv/rv64/rvv/strcspn.c   |  22 ++++
> >  sysdeps/riscv/rv64/rvv/strlen.S    |  67 ++++++++++
> >  sysdeps/riscv/rv64/rvv/strncmp.S   | 104 ++++++++++++++++
> >  sysdeps/riscv/rv64/rvv/strncpy.S   |  96 +++++++++++++++
> >  sysdeps/riscv/rv64/rvv/strnlen.S   |  81 +++++++++++++
> >  sysdeps/riscv/rv64/rvv/strrchr.S   |  88 ++++++++++++++
> >  sysdeps/riscv/rv64/rvv/strspn.S    | 189 +++++++++++++++++++++++++++++
> >  17 files changed, 1428 insertions(+)
> >  create mode 100644 sysdeps/riscv/rv64/rvv/Implies
> >  create mode 100644 sysdeps/riscv/rv64/rvv/memchr.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/memcmp.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/memcpy.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/memmove.c
> >  create mode 100644 sysdeps/riscv/rv64/rvv/memset.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strchr.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strchrnul.c
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strcmp.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strcpy.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strcspn.c
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strlen.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strncmp.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strncpy.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strnlen.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strrchr.S
> >  create mode 100644 sysdeps/riscv/rv64/rvv/strspn.S
> >
> > diff --git a/sysdeps/riscv/rv64/rvv/Implies
> b/sysdeps/riscv/rv64/rvv/Implies
> > new file mode 100644
> > index 0000000000..b07b4cb906
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/Implies
> > @@ -0,0 +1,2 @@
> > +riscv/rv64/rvd
> > +
> > diff --git a/sysdeps/riscv/rv64/rvv/memchr.S
> b/sysdeps/riscv/rv64/rvv/memchr.S
> > new file mode 100644
> > index 0000000000..a7e32b8f25
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/memchr.S
> > @@ -0,0 +1,127 @@
> > +
>
> Spurious new line at the start.  We also require a brief comment describing
> the file contents for newer files.
>
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
>
> Not sure 2012 range fits here.
>
> > +
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +
> > +/* Optimised memchr for riscv with vector extension
> > + * Assumptions:
> > + *    - cpu becomes bandwidth limited at or before
> > + *            2 vector register sized read/write operations
> > + *          + 2 scalar operations
> > + *          + conditional branch
> > + */
> > +
> > +.globl  memchr
> > +.type   memchr,@function
> > +
> > +.align    2
> > +memchr:
>
> We have the ENTRY macro for that.
>
> > +    beqz a2, .Lnot_found
>
> Maybe use the L macro here for local labels;
>
> > +    csrr    t1, vlenb
> > +    bgeu    a2, t1, .Lvector_path   /* only use vector path if we're
> scanning
> > +                                       at least vlenb bytes */
> > +
> > +#ifndef __riscv_strict_align
>
> Would this be defined by compiler as predefine macro or is it just a debug
> switch? If the later, I think it would be better to remove it.
>
> > +    li a3, 8
> > +    blt a2, a3, .Lbytewise
> > +
> > +    li      t1, 0x0101010101010101
> > +    slli    a4, t1, 7     /* a4 = 0x8080808080808080 */
> > +    mul     t2, a1, t1    /* entirety of t2 is now repeats of target
> character;
> > +                             assume mul is at worst no worse than
> 3*(shift+OR),
> > +                             otherwise do that instead */
> > +
> > +/*
> > + * strategy:
> > + * t4 = ((*a0) ^ t2)
> > + *      - now t4 contains zero bytes if and only if next word of memory
> > + *        had target character at those positions
> > + *
> > + * t4 = ((t4-0x0101010101010101) & ~t4) & 0x8080808080808080
> > + *      - all nonzero bytes of t4 become 0; zero bytes become 0x80
> > + *
> > + * if t4 is nonzero, find the index of the byte within it, add to a0
> and return
> > + * otherwise, loop
> > + */
> > +
> > +1:
> > +    ld t4, (a0)          /* t4 = load next 8 bytes */
> > +    xor t4, t4, t2
> > +    sub t5, t4, t1
> > +    not t4, t4
> > +    and t4, t5, t4
> > +    and t4, t4, a4
> > +    bnez t4, .Lbytewise  /* could use ctzw, mod+lookup or just binary
> chop
> > +                            to locate byte of interest in t4 but
> profiling
> > +                            shows these approaches are at best no
> better */
> > +    addi a2, a2, -8
> > +    addi a0, a0, 8
> > +    bgeu a2, a3, 1b
> > +    beqz a2, .Lnot_found
> > +#endif // __riscv_strict_align
> > +
> > +/* too little data for a dword. mask calculation and branch mispredict
> costs
> > +   make checking a word not worthwhile. degrade to bytewise search. */
> > +
> > +.Lbytewise:
> > +    add t2, a0, a2
> > +
> > +1:
> > +    lb t1, (a0)
> > +    beq t1, a1, .Lfound
> > +    addi a0, a0, 1
> > +    blt a0, t2, 1b
> > +
> > +.Lnot_found:
> > +    mv a0, zero
> > +.Lfound:
> > +    ret
> > +
> > +.Lvector_path:
> > +    vsetvli t2, a2, e8, m2, ta, ma
> > +
> > +1:
> > +    vle8.v      v2, (a0)
> > +    vmseq.vx    v0, v2, a1
> > +    vfirst.m    t3, v0
> > +    bgez        t3, .Lvec_found
> > +    add         a0, a0, t2
> > +    sub         a2, a2, t2
> > +    bge         a2, t2, 1b
> > +    bnez        a2, 2f
> > +    mv          a0, zero
> > +    ret
> > +
> > +2:
> > +    vsetvli t2, a2, e8, m2, ta, ma
> > +    vle8.v      v2, (a0)
> > +    vmseq.vx    v0, v2, a1
> > +    vfirst.m    t3, v0
> > +    bgez        t3, .Lvec_found
> > +    mv          a0, zero
> > +    ret
> > +
> > +.Lvec_found:
> > +    add a0, a0, t3
> > +    ret
> > +
> > +.size   memchr, .-memchr
> > +libc_hidden_builtin_def (memchr)
> > \ No newline at end of file
>
> Please add a newline.
>
> > diff --git a/sysdeps/riscv/rv64/rvv/strcpy.S
> b/sysdeps/riscv/rv64/rvv/strcpy.S
> > new file mode 100644
> > index 0000000000..b21909d66f
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strcpy.S
> > @@ -0,0 +1,72 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
>
> You can add a optimize stpcpy and use to implement strcpy on top of that
> (as my generic proposal does [1]). ARMv6 does something similar [2]
>
> [1]
> https://patchwork.sourceware.org/project/glibc/patch/20230201170406.303978-12-adhemerval.zanella@linaro.org/
> [2]
> https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/arm/armv6/strcpy.S;h=e9f63a56c1c605a21b05f7ac21412585b0705171;hb=HEAD
>
> > +#include <sysdep.h>
> > +
> > +.globl  strcpy
> > +.type   strcpy,@function
> > +
> > +/*
> > + *  optimized strcpy for riscv with vector extension
> > + *  assumptions:
> > + *  - vlenb is a power of 2
> > + *  - page size >= 2*vlenb
> > + */
> > +
> > +.align    2
> > +strcpy:
> > +    mv          t0, a0                     /* copy dest so we can
> return it */
> > +
> > +    csrr        t1, vlenb                  /* find vlenb*2 */
> > +    add         t1, t1, t1
> > +
> > +    addi        t2, t1, -1                 /* mask unaligned part of
> ptr */
> > +    and         t2, a1, t2
> > +    beqz        t2, .Laligned
> > +
> > +    sub         t2, t1, t2                 /* search enough to align
> ptr */
> > +    vsetvli     t2, t2, e8, m2, tu, mu
> > +    vle8.v      v2, (a1)
> > +    vmseq.vx    v4, v2, zero
> > +    vmsif.m     v0, v4                     /* copy but not past null */
> > +    vfirst.m    t3, v4
> > +    vse8.v      v2, (t0), v0.t
> > +    bgez        t3, .Ldone
> > +    add         t0, t0, t2
> > +    add         a1, a1, t2
> > +
> > +.Laligned:
> > +    vsetvli     zero, t1, e8, m2, ta, mu   /* now do 2*vlenb bytes per
> pass */
> > +
> > +1:
> > +    vle8.v      v2, (a1)
> > +    add         a1, a1, t1
> > +    vmseq.vx    v4, v2, zero
> > +    vmsif.m     v0, v4
> > +    vfirst.m    t3, v4
> > +    vse8.v      v2, (t0), v0.t
> > +    add         t0, t0, t1
> > +    bltz        t3, 1b
> > +
> > +.Ldone:
> > +    ret
> > +
> > +.size   strcpy, .-strcpy
> > +libc_hidden_builtin_def (strcpy)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strcspn.c
> b/sysdeps/riscv/rv64/rvv/strcspn.c
> > new file mode 100644
> > index 0000000000..f0595a72fb
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strcspn.c
> > @@ -0,0 +1,22 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +/* strcspn is implemented in strspn.S
> > + */
> > diff --git a/sysdeps/riscv/rv64/rvv/strlen.S
> b/sysdeps/riscv/rv64/rvv/strlen.S
> > new file mode 100644
> > index 0000000000..c77d500693
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strlen.S
> > @@ -0,0 +1,67 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +.globl  strlen
> > +.type   strlen,@function
> > +
> > +/*
> > + *  optimized strlen for riscv with vector extension
> > + *  assumptions:
> > + *  - vlenb is a power of 2
> > + *  - page size >= 2*vlenb
> > + */
> > +
> > +.align    2
> > +strlen:
> > +    mv          t4, a0                    /* copy of buffer start */
> > +    csrr        t1, vlenb                 /* find vlenb*2 */
> > +    add         t1, t1, t1
> > +    addi        t2, t1, -1                /* mask off unaligned part of
> ptr */
> > +    and         t2, a0, t2
> > +    beqz        t2, .Laligned
> > +
> > +    sub         t2, t1, t2                /* search fwd to align ptr */
> > +    vsetvli     t2, t2, e8, m2, ta, ma
> > +    vle8.v      v2, (a0)
> > +    vmseq.vx    v0, v2, zero
> > +    vfirst.m    t3, v0
> > +    bgez        t3, .Lfound
> > +    add         a0, a0, t2
> > +
> > +.Laligned:
> > +    vsetvli     zero, t1, e8, m2, ta, ma  /* search 2*vlenb bytes per
> pass */
> > +    add         t4, t4, t1
> > +
> > +1:
> > +    vle8.v      v2, (a0)
> > +    add         a0, a0, t1
> > +    vmseq.vx    v0, v2, zero
> > +    vfirst.m    t3, v0
> > +    bltz        t3, 1b
> > +
> > +.Lfound:                                  /* found the 0; subtract
>     */
> > +    sub         a0, a0, t4                /* buffer start from current
> ptr  */
> > +    add         a0, a0, t3                /* and add offset into
> fetched    */
> > +    ret                                   /* data to get length */
> > +
> > +.size   strlen, .-strlen
> > +libc_hidden_builtin_def (strlen)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strncmp.S
> b/sysdeps/riscv/rv64/rvv/strncmp.S
> > new file mode 100644
> > index 0000000000..863e5cb525
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strncmp.S
> > @@ -0,0 +1,104 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +.globl  strncmp
> > +.type   strncmp,@function
> > +
> > +.align    2
> > +
> > +/* as strcmp, but with added checks on a2 (max count)
> > + */
> > +
> > +strncmp:
> > +    csrr        t1, vlenb                   /* find vlenb*2 */
> > +    add         t1, t1, t1
> > +    blt         a2, t1, .Ltail              /* degrade if max < vlenb*2
> */
> > +    vsetvli     zero, t1, e8, m2, ta, mu
> > +    vid.v       v30
> > +    addi        t2, t1, -1                  /* mask unaligned part of
> ptr */
> > +    and         t6, a0, t2                  /* unaligned part of lhs */
> > +    and         t5, a1, t2                  /* unaligned part of rhs */
> > +    sub         t6, t1, t6                  /* safe count to read from
> lhs */
> > +    sub         t5, t1, t5                  /* same, rhs */
> > +    vmsltu.vx   v28, v30, t6                /* mask for first part of
> lhs */
> > +    vmsltu.vx   v26, v30, t5                /* mask for first part of
> rhs */
> > +    vmv.v.x     v16, zero
> > +    vmv.v.x     v18, zero
> > +
> > +
> > +1:  blt         a2, t1, .Ltail
> > +    vmv.v.v     v0, v28                     /* lhs mask */
> > +    vle8.v      v2, (a0), v0.t              /* masked load from lhs */
> > +    vmseq.vx    v16, v2, zero, v0.t         /* check loaded bytes for
> null */
> > +    vmv.v.v     v0, v26                     /*       rhs mask */
> > +    vfirst.m    t2, v16                     /* get lhs check result */
> > +    bgez        t2, .Ltail                  /* can we safely check rest
> */
> > +    vle8.v      v4, (a1), v0.t              /*       masked load from
> rhs */
> > +    vmseq.vx    v18, v4, zero, v0.t         /*       check partial rhs
> */
> > +    vmnot.m     v0, v28                     /* mask for rest of lhs */
> > +    vfirst.m    t3, v18                     /* get check result */
> > +    bltz        t3, 2f                      /* test it */
> > +    bge         t3, t6, .Ltail
> > +
> > +    vmsleu.vx   v0, v30, t3                 /* select rest of string +
> null */
> > +    vmsne.vv    v0, v2, v4, v0.t            /* compare */
> > +    vfirst.m    t3, v0
> > +    bgez        t3, 3f
> > +    mv          a0, zero
> > +    ret
> > +3:  add a0, a0, t3
> > +    add a1, a1, t3
> > +    lbu t0, (a0)
> > +    lbu t1, (a1)
> > +.Ldiff:
> > +    sub a0, t0, t1
> > +    ret
> > +
> > +    /* ...no null terminator in first part of lhs or rhs */
> > +2:  vle8.v      v2, (a0), v0.t              /* load rest of lhs */
> > +    vmnot.m     v0, v26                     /* mask for rest of rhs */
> > +    vle8.v      v4, (a1), v0.t              /* load rest of rhs */
> > +    vmsne.vv    v0, v2, v4                  /* compare */
> > +    add         a0, a0, t1                  /* advance ptrs */
> > +    vfirst.m    t3, v0
> > +    add         a1, a1, t1
> > +    sub         a2, a2, t1
> > +    bltz        t3, 1b
> > +
> > +    sub t3, t3, t1  /* found a diff but we've already advanced a0 and
> a1 */
> > +    j 3b
> > +
> > +.Ltail:
> > +    beqz a2, 1f
> > +    addi a2, a2, -1
> > +    lbu t0, (a0)
> > +    lbu t1, (a1)
> > +    bne t0, t1, .Ldiff
> > +    addi a0, a0, 1
> > +    addi a1, a1, 1
> > +    bnez t0, .Ltail
> > +1:  mv a0, zero
> > +    ret
> > +
> > +
> > +.size strncmp, .-strncmp
> > +libc_hidden_builtin_def (strncmp)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strncpy.S b/sysdeps/riscv/rv64/rvv/strncpy.S
> > new file mode 100644
> > index 0000000000..8b3a1e545c
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strncpy.S
> > @@ -0,0 +1,96 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +.globl  strncpy
> > +.type   strncpy,@function
> > +
> > +/*
> > + *  optimized strncpy for riscv with vector extension
> > + *  assumptions:
> > + *  - vlenb is a power of 2
> > + *  - page size >= 2*vlenb
> > + */
> > +
> > +.align    2
> > +strncpy:
> > +    mv          t0, a0                    /* need to return dest so
> copy */
> > +
> > +    csrr        t1, vlenb                 /* find vlenb*2 */
> > +    add         t1, t1, t1
> > +
> > +    addi        t2, t1, -1                /* mask off unaligned part of
> ptr */
> > +    and         t2, a1, t2
> > +    beqz        t2, .Laligned
> > +
> > +    sub         t2, t1, t2                /* search to align the
> pointer */
> > +    vsetvli     zero, t2, e8, m2, tu, mu
> > +    vle8.v      v2, (a1)
> > +    vmseq.vx    v4, v2, zero
> > +    vmsif.m     v0, v4                    /* copy to dest */
> > +    vfirst.m    t3, v4
> > +    bgeu        t2, a2, .Ldest_full
> > +    vse8.v      v2, (t0), v0.t
> > +    bgez        t3, .Lterminator_found
> > +    add         t0, t0, t2
> > +    add         a1, a1, t2
> > +    sub         a2, a2, t2
> > +    beqz        a2, .Ldone
> > +
> > +.Laligned:
> > +    vsetvli     zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per
> pass */
> > +
> > +1:
> > +    vle8.v      v2, (a1)
> > +    add         a1, a1, t1
> > +    vmseq.vx    v4, v2, zero
> > +    vmsif.m     v0, v4
> > +    vfirst.m    t3, v4
> > +    bgeu        t1, a2, .Ldest_full
> > +    vse8.v      v2, (t0), v0.t
> > +    add         t0, t0, t1
> > +    sub         a2, a2, t1
> > +    bltz        t3, 1b
> > +    sub         t0, t0, t1
> > +
> > +.Lterminator_found:
> > +    addi        sp, sp, -16
> > +    sd          ra, 0(sp)
> > +    sd          a0, 8(sp)
> > +    add         a0, t0, t3
> > +    mv          a1, zero
> > +    sub         a2, a2, t3
> > +    jal         ra, memset
> > +    ld          ra, 0(sp)
> > +    ld          a0, 8(sp)
> > +    addi        sp, sp, 16
> > +.Ldone:
> > +    ret
> > +
> > +.Ldest_full:
> > +    vid.v       v6
> > +    vmsltu.vx   v4, v6, a2
> > +    vmand.mm    v0, v0, v4
> > +    vse8.v      v2, (t0), v0.t
> > +    ret
> > +
> > +.size   strncpy, .-strncpy
> > +libc_hidden_builtin_def (strncpy)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strnlen.S b/sysdeps/riscv/rv64/rvv/strnlen.S
> > new file mode 100644
> > index 0000000000..6d7ee65c7a
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strnlen.S
> > @@ -0,0 +1,81 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +#include <sysdep.h>
> > +
>
> Maybe use a generic implementation that issues memchr (which should be
> optimized using vector instructions) [3]?  It would be an extra function
> call, but it should really help on both code size and icache pressure.
>
> [3] https://patchwork.sourceware.org/project/glibc/patch/20230201170406.303978-6-adhemerval.zanella@linaro.org/
>
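
For reference, the memchr-based generic strnlen suggested above would be
roughly the following (a sketch of the idea only, not the code from that patch):

  #include <string.h>

  size_t
  __strnlen (const char *str, size_t maxlen)
  {
    /* Let the (vectorised) memchr do the scanning; stop at the first NUL
       or at the maxlen limit, whichever comes first.  */
    const char *found = memchr (str, '\0', maxlen);
    return found != NULL ? (size_t) (found - str) : maxlen;
  }
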
> > +.globl  __strnlen
> > +.type   __strnlen,@function
> > +
> > +/* vector optimized strnlen
> > + * assume it's safe to read to the end of the page
> > + * containing either a null terminator or the last byte of the count or
> both,
> > + * but not past it
> > + * assume page size >= vlenb*2
> > + */
> > +
> > +.align    2
> > +__strnlen:
> > +    mv          t4, a0               /* stash a copy of start for later
> */
> > +    beqz        a1, .LzeroCount
> > +
> > +    csrr        t1, vlenb            /* find vlenb*2 */
> > +    add         t1, t1, t1
> > +    addi        t2, t1, -1           /* mask off unaligned part of ptr
> */
> > +    and         t2, a1, a0
> > +    beqz        t2, .Laligned
> > +
> > +    sub         t2, t1, t2           /* search to align pointer to t1 */
> > +    bgeu        t2, a1, 2f           /* check it's safe */
> > +    mv          t2, a1               /* it's not! look as far as
> permitted */
> > +2:  vsetvli     t2, t2, e8, m2, ta, ma
> > +    vle8.v      v2, (a0)
> > +    vmseq.vx    v0, v2, zero
> > +    vfirst.m    t3, v0
> > +    bgez        t3, .Lfound
> > +    add         a0, a0, t2
> > +    sub         a1, a1, t2
> > +    bltu        a1, t1, .LreachedCount
> > +
> > +.Laligned:
> > +    vsetvli     zero, t1, e8, m2, ta, ma    /* do 2*vlenb bytes per
> pass */
> > +
> > +1:  vle8.v      v2, (a0)
> > +    sub         a1, a1, t1
> > +    vmseq.vx    v0, v2, zero
> > +    vfirst.m    t3, v0
> > +    bgez        t3, .Lfound
> > +    add         a0, a0, t1
> > +    bgeu        a1, t1, 1b
> > +.LreachedCount:
> > +    mv          t2, a1    /* in case 0 < a1 < t1 */
> > +    bnez        a1, 2b    /* if so, still t2 bytes to check, all safe */
> > +.LzeroCount:
> > +    sub         a0, a0, t4
> > +    ret
> > +
> > +.Lfound:        /* found the 0; subtract buffer start from current
> pointer */
> > +    add         a0, a0, t3 /* and add offset into fetched data */
> > +    sub         a0, a0, t4
> > +    ret
> > +
> > +.size   __strnlen, .-__strnlen
> > +weak_alias (__strnlen, strnlen)
> > +libc_hidden_builtin_def (__strnlen)
> > +libc_hidden_builtin_def (strnlen)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strrchr.S b/sysdeps/riscv/rv64/rvv/strrchr.S
> > new file mode 100644
> > index 0000000000..4bef8a3b9c
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strrchr.S
> > @@ -0,0 +1,88 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +#include <sysdep.h>
>
> Is it really worth adding a strrchr optimization?  The generic
> implementation already calls strchr (which should be optimized).
>
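
(For context, the generic strrchr referred to here is essentially a loop over
strchr, along these lines; a rough sketch, not the exact glibc source:)

  #include <string.h>

  char *
  strrchr (const char *s, int c)
  {
    const char *found = NULL;
    const char *p;

    /* Keep asking strchr for the next match and remember the last one seen.  */
    while ((p = strchr (s, c)) != NULL)
      {
        found = p;
        if (*p == '\0')         /* searching for the terminator itself */
          break;
        s = p + 1;
      }
    return (char *) found;
  }
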
> > +
> > +.globl  strrchr
> > +.type   strrchr,@function
> > +
> > +/*
> > + *  optimized strrchr for riscv with vector extension
> > + *  assumptions:
> > + *  - vlenb is a power of 2
> > + *  - page size >= 2*vlenb
> > + */
> > +
> > +.align    2
> > +strrchr:
> > +    mv          t5, a0    /* stash buffer ptr somewhere safe */
> > +    mv          a0, zero  /* result is nullptr unless we find better
> below */
> > +
> > +    csrr        t1, vlenb                /* determine vlenb*2 */
> > +    add         t1, t1, t1
> > +    addi        t2, t1, -1               /* mask off unaligned part of
> ptr */
> > +    and         t2, t5, t2
> > +    beqz        t2, .Laligned
> > +
> > +    sub         t2, t1, t2               /* search to align ptr to
> 2*vlenb */
> > +    vsetvli     t2, t2, e8, m2, ta, mu
> > +
> > +    vle8.v      v2, (t5)                 /* load data into v2(,v3) */
> > +    vmseq.vx    v4, v2, zero             /* check for null terminator */
> > +    vfirst.m    t4, v4                   /* grab its position, if any */
> > +    vmsbf.m     v0, v4                   /* select valid chars */
> > +    vmseq.vx    v0, v2, a1, v0.t         /* search for candidate byte */
> > +    vfirst.m    t3, v0                   /* grab its position, if any */
> > +    bltz        t3, 2f                   /* did we find a candidate? */
> > +
> > +3:  add         a0, t3, t5               /* we did! grab the address */
> > +    vmsof.m     v1, v0                   /* there might be more than
> one */
> > +    vmandn.mm   v0, v0, v1               /* so clear the one we just
> found */
> > +    vfirst.m    t3, v0                   /* is there another? */
> > +    bgez        t3, 3b
> > +
> > +2:  bgez        t4, .Ldone               /* did we see a null
> terminator? */
> > +    add         t5, t5, t2
> > +
> > +.Laligned:
> > +    vsetvli     zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per
> pass */
> > +
> > +1:  vle8.v      v2, (t5)
> > +    vmseq.vx    v4, v2, zero
> > +    vfirst.m    t4, v4
> > +    vmsbf.m     v0, v4
> > +    vmseq.vx    v0, v2, a1, v0.t
> > +    vfirst.m    t3, v0
> > +    bltz        t3, 2f
> > +
> > +3:  add         a0, t3, t5
> > +    vmsof.m     v1, v0
> > +    vmandn.mm   v0, v0, v1
> > +    vfirst.m    t3, v0
> > +    bgez        t3, 3b
> > +
> > +2:  add         t5, t5, t1
> > +    bltz        t4, 1b
> > +
> > +.Ldone:
> > +    ret
> > +
> > +.size   strrchr, .-strrchr
> > +libc_hidden_builtin_def (strrchr)
> > \ No newline at end of file
> > diff --git a/sysdeps/riscv/rv64/rvv/strspn.S b/sysdeps/riscv/rv64/rvv/strspn.S
> > new file mode 100644
> > index 0000000000..2b9af5cc2d
> > --- /dev/null
> > +++ b/sysdeps/riscv/rv64/rvv/strspn.S
> > @@ -0,0 +1,189 @@
> > +
> > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> > +
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +
> > +#include <sysdep.h>
> > +
> > +.globl  strspn
> > +.type   strspn,@function
> > +
> > +.globl  strcspn
> > +.type   strcspn,@function
> > +
> > +/*
> > + *  optimized strspn / strcspn for riscv with vector extension
> > + *  assumptions:
> > + *  - vlenb is a power of 2
> > + *  - page size >= 32
> > + *  strategy:
> > + *  - build a 256-bit table on the stack, where each elt is zero
> > + *    if encountering it should terminate computation and nonzero
> otherwise
> > + *  - use vectorised lookups into this to check 2*vlen elts at a time;
> > + *    this code is identical for strspn and strcspn and can be shared
> > + *
> > + *  note that while V mandates at least 128 bit wide regs,
> > + *  we are building a 256 bit lookup table
> > + *  therefore we use either LMUL=1 or 2 depending on what the target
> supports
> > + *  therefore we only use even vector register numbers,
> > + *  so everything still works if we go with LMUL=2
> > + */
> > +
>
> I wonder if we could adapt the generic implementation, so riscv only
> reimplements the vectorized search instead of all the boilerplate to
> generate the table and the early tests.
>
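
(For illustration, that split might look something like the sketch below, with
only the scan loop overridden per-arch; the helper name is hypothetical, not an
existing glibc interface:)

  #include <string.h>

  /* Shared C code builds the 256-entry table and handles the trivial cases;
     an architecture would only replace this scan loop (e.g. with the RVV
     gather-based loop above).  */
  static size_t
  strspn_scan (const char *s, const unsigned char table[256])
  {
    const char *p = s;
    while (table[(unsigned char) *p])
      ++p;
    return p - s;
  }

  size_t
  strspn (const char *s, const char *accept)
  {
    unsigned char table[256] = { 0 };
    for (; *accept != '\0'; ++accept)
      table[(unsigned char) *accept] = 1;
    /* table['\0'] stays 0, so the scan always stops at the end of S.  */
    return strspn_scan (s, table);
  }
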
> > +# -----------------------------
> > +
> > +.align    2
> > +
> > +strspn:
> > +    lbu         t0, 0(a1)
> > +    bnez        t0, .Lbuild_table
> > +    mv          a0, zero
> > +    ret
> > +
> > +.Lbuild_table:
> > +    mv          a6, a0 /* store incoming a0 */
> > +    li          t1, 32 /* want to deal with 256 bits at a time, so 32
> bytes */
> > +
> > +    vsetvli     zero, t1, e8, m1, tu, mu
> > +#if __riscv_v_min_vlen < 256
> > +    /* we want to build a 256-bit table, so use vlenb*2,
> > +     * m2 if regs are 128 bits wide or vlenb, m1 if >= 256
> > +     * 'V' extension specifies a minimum vlen of 128 so this should
> cover
> > +     * all cases; we can skip the check if we know vlen >= 256 at
> compile time
> > +     */
> > +    csrr        t2, vlenb
> > +    bgeu        t2, t1, 1f
> > +    vsetvli     zero, t1, e8, m2, tu, mu
> > +1:
> > +#endif // __riscv_v_min_vlen
> > +
> > +    /* read one char from the charset at a time and write the correct
> bit
> > +     * in the lookup table; we could do SIMD iff we ever get an
> extension
> > +     * that provides some way of scattering bytes into a reg group
> > +     */
> > +    vmv.v.x     v16, zero       /* clear out table */
> > +    vmv.v.x     v8, zero        /* clear out v8 */
> > +    li          t3, 1
> > +    vmv.s.x     v8, t3          /* v8 now all zeroes except bottom byte
> */
> > +
> > +1:  vmv.v.x     v2, zero        /* clear out v2 */
> > +    addi        a1, a1, 1       /* advance charset ptr */
> > +    srli        t2, t0, 3       /* divide the byte we read earlier by 8
> */
> > +    vslideup.vx v2, v8, t2      /* v2 now 1 in the correct byte 0
> elsewhere */
> > +    vsll.vx     v2, v2, t0      /* v2 now 1 in the correct bit, 0
> elsewhere */
> > +    vor.vv      v16, v16, v2    /* or it in */
> > +    lbu         t0, 0(a1)       /* fetch next byte */
> > +    bnez        t0, 1b          /* if it's not null, go round again */
> > +
> > +/*
> > + *   Table is now built in v16.
> > + *   Strategy:
> > + *   - fetch next t1 bytes from memory
> > + *   - vrgather on their values divided by 8 to get relevant bytes of
> table
> > + *   - shift right to get the correct bit into bit 1
> > + *   - and with 1, compare with expected terminator value, then check
> mask
> > + *     to see if we've found a terminator
> > + *
> > + *   Before we can begin, a0 needs to be t1-aligned, so that when we
> fetch
> > + *   the next t1 bytes - any of which may be the null terminator -
> > + *   we do not cross a page boundary and read unmapped memory. Therefore
> > + *   we have one read of however many bytes are needed to align a0,
> > + *   before the main loop.
> > + */
> > +
> > +.Lscan_table:
> > +    vmv.v.x     v8, t3              /* v8 now t1 bytes of 0x01 */
> > +
> > +    and         t2, a0, t1          /* mask to align to t1 */
> > +    beqz        t2, 2f              /* or skip if we're already aligned
> */
> > +    sub         t2, t1, t2          /* t2 now bytes to read to align to
> t1 */
> > +
> > +    vid.v       v2                  /* build mask instead of changing
> vl */
> > +    vmsltu.vx   v0, v2, t2          /* so we don't need to track LMUL */
> > +
> > +    vle8.v      v2, (a0), v0.t      /* load next bytes from input */
> > +    vsrl.vi     v4, v2, 3           /* divide by 8 */
> > +    vrgather.vv v6, v16, v4         /* corresponding bytes of bit table
> */
> > +    vsrl.vv     v6, v6, v2          /* shift correct bits to lsb */
> > +    vand.vv     v6, v6, v8          /* and with 1 to complete the
> lookups */
> > +    vmseq.vx    v4, v6, zero, v0.t  /* check to see if any 0s are
> present */
> > +    vfirst.m    t0, v4              /* index of the first 0, if any */
> > +    bgez        t0, .Lscan_end      /* if we found one, stop */
> > +    add         a0, a0, t2          /* advance by number of bytes we
> read */
> > +
> > +2:  add         a6, a6, t1     /* we'll advance a0 before the exit
> check */
> > +1:  vle8.v      v2, (a0)       /* as above but unmasked so t1 elts per
> pass */
> > +    add         a0, a0, t1
> > +
> > +    vsrl.vi     v4, v2, 3
> > +    vrgather.vv v6, v16, v4
> > +    vsrl.vv     v6, v6, v2
> > +    vand.vv     v6, v6, v8
> > +
> > +    vmseq.vx    v4, v6, zero
> > +    vfirst.m    t0, v4
> > +    bltz        t0, 1b
> > +
> > +.Lscan_end:
> > +    add         a0, a0, t0     /* calculate offset to terminating byte
> */
> > +    sub         a0, a0, a6
> > +    ret
> > +.size   strspn, .-strspn
> > +
> > +/* strcspn
> > + *
> > + * table build exactly as for strspn, except:
> > + * - the lookup table starts with all bits except bit 0 of byte 0 set
> > + * - we clear the corresponding bit for each byte in the charset
> > + * once table is built, we can reuse the scan code directly
> > + */
> > +
> > +strcspn:
> > +    lbu         t0, 0(a1)
> > +    beqz        t0, strlen   /* no rejections -> prefix is whole string
> */
> > +
> > +    mv          a6, a0
> > +    li          t1, 32
> > +
> > +    vsetvli     zero, t1, e8, m1, tu, mu
> > +#if __riscv_v_min_vlen < 256
> > +    csrr        t2, vlenb
> > +    bgeu        t2, t1, 1f
> > +    vsetvli     zero, t1, e8, m2, tu, mu
> > +1:
> > +#endif // __riscv_v_min_vlen
> > +
> > +    vmv.v.x     v8, zero
> > +    li          t3, 1           /* all bits clear except bit 0 of byte
> 0 */
> > +    vmv.s.x     v8, t3
> > +    vnot.v      v16, v8         /* v16 is the inverse of that */
> > +    li          t4, -1
> > +
> > +1:  vmv.v.x     v2, zero
> > +    addi        a1, a1, 1       /* advance charset ptr */
> > +    srli        t2, t0, 3       /* select correct bit in v2 */
> > +    vslideup.vx v2, v8, t2
> > +    vsll.vx     v2, v2, t0
> > +    vnot.v      v2, v2          /* invert */
> > +    vand.vv     v16, v16, v2    /* clear the relevant bit of table */
> > +    lbu         t0, 0(a1)
> > +    bnez        t0, 1b
> > +    j           .Lscan_table
> > +.size   strcspn, .-strcspn
> > +
> > +libc_hidden_builtin_def (strspn)
> > +libc_hidden_builtin_def (strcspn)
> > \ No newline at end of file
>
  
Adhemerval Zanella Netto Feb. 2, 2023, 2:26 p.m. UTC | #12
On 02/02/23 07:02, Sergei Lewis wrote:
> Thank you very much for the detailed review!
> 
>> > +#ifndef __riscv_strict_align
>> Would this be defined by the compiler as a predefined macro or is it just a
>> debug switch? If the latter, I think it would be better to remove it.
> 
> The intent is to make use of the gcc feature in flight here: https://gcc.gnu.org/pipermail/gcc-patches/2023-January/610115.html to detect the situation where the build environment has been configured to avoid unaligned access.

I am not sure this is a good way forward for glibc: it means *another* variant
to build, check, and validate and, worse, one that is not tied to any ABI/cpu
but to a compiler option.  I think it would be better to provide vectorized
mem* and str* that work independently of the compiler option used.

> 
> 
>> It is really worth to add a strrchr optimization?  The generic implementation
>> already calls strchr (which should be optimized).
> 
> The performance win is actually quite significant; consider searching for the first space in a piece of text compared to the last space - reusing strchr in a loop as the generic implementation does would cause branches every few bytes of input, and essentially destroy all benefits gained from vectorisation. Worse, vectorised functions typically have a few instructions of setup/overhead before the work begins and these would likewise be repeated every few bytes of input.

Another option, which I will add in my default string refactor, is to use
strlen plus memrchr:

  char *strrchr (const char *s, int c)
  {
    return __memrchr (s, c, strlen(s) + 1);
  }

It would be only 2 function calls, and if the architecture provides optimized
strlen and memrchr, the performance overhead should be only the additional
function calls (with the advantage of less icache pressure).

> 
>> I wonder if we could adapt the generic implementation, so riscv only reimplements
>> the vectorized search instead of all the boilerplace to generate the table and
>> early tests.
> 
> The issue is that the table looks different for different implementations, and possibly even for different cases in the same implementation; e.g. some of the existing implementations use a 256-byte table with one byte per character rather than a 256-bit bitfield as I do here (and going forward we would potentially want such a path for riscv as well and select between them based on the length of the character set - common use in parsing will tend to produce very small character sets, but if we get a large one or potentially always depending on architecture, using indexed loads/stores will become faster than the bitfield approach I use here).

I recall that I tested using a 256-bit bitfield instead of a 256-byte table,
but it incurred some overhead on most architectures (I might check again).

One option might be to parametrize both the table generation and the table search,
but it might not be profitable for 
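
For comparison, the two layouts differ only in how a byte is tested; roughly
(the helper names here are just for illustration):

  /* 256-byte table: one byte per character value.  */
  static inline int
  in_set_bytetable (const unsigned char table[256], unsigned char c)
  {
    return table[c] != 0;
  }

  /* 256-bit bitfield: 32 bytes, one bit per character value.  */
  static inline int
  in_set_bitfield (const unsigned char bits[32], unsigned char c)
  {
    return (bits[c >> 3] >> (c & 7)) & 1;
  }

The bitfield saves 224 bytes of table but costs an extra shift and mask per
byte tested, which is presumably where the extra overhead comes from.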

> 
> I am integrating all the other feedback, and will also work with your changes to the generic implementations - it looks like there is quite a bit of potential to reduce and simplify my changeset once yours goes in.
> 
> On Wed, Feb 1, 2023 at 5:38 PM Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> wrote:
> 
> 
> 
>     On 01/02/23 06:52, Sergei Lewis wrote:
>     > Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
>     > strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
>     > targeting the riscv "V" extension, version 1.0
>     >
>     > The vectorised implementations assume VLENB of at least 128 and at least 32
>     > registers (as mandated by the "V" extension spec). They also assume that
>     > VLENB is a power of two which is no larger than the page size, and (as
>     > vectorised code in glibc for other platforms does) that it is safe to read
>     > past null terminators / buffer ends provided one does not cross a page
>     > boundary.
>     >
>     > Signed-off-by: Sergei Lewis <slewis@rivosinc.com>
> 
>     Some comments that might be useful since I am working on the generic implementations
>     below.
> 
>     Also, I think it should be split with one implementation per patch, unless the
>     implementation is tied together (as for strchr/strchrnul for instance).  Does
>     the vectorized routine only work for rv64?
> 
>     > ---
>     >  sysdeps/riscv/rv64/rvv/Implies     |   2 +
>     >  sysdeps/riscv/rv64/rvv/memchr.S    | 127 +++++++++++++++++++
>     >  sysdeps/riscv/rv64/rvv/memcmp.S    |  93 ++++++++++++++
>     >  sysdeps/riscv/rv64/rvv/memcpy.S    | 154 +++++++++++++++++++++++
>     >  sysdeps/riscv/rv64/rvv/memmove.c   |  22 ++++
>     >  sysdeps/riscv/rv64/rvv/memset.S    |  89 ++++++++++++++
>     >  sysdeps/riscv/rv64/rvv/strchr.S    |  92 ++++++++++++++
>     >  sysdeps/riscv/rv64/rvv/strchrnul.c |  22 ++++
>     >  sysdeps/riscv/rv64/rvv/strcmp.S    | 108 +++++++++++++++++
>     >  sysdeps/riscv/rv64/rvv/strcpy.S    |  72 +++++++++++
>     >  sysdeps/riscv/rv64/rvv/strcspn.c   |  22 ++++
>     >  sysdeps/riscv/rv64/rvv/strlen.S    |  67 ++++++++++
>     >  sysdeps/riscv/rv64/rvv/strncmp.S   | 104 ++++++++++++++++
>     >  sysdeps/riscv/rv64/rvv/strncpy.S   |  96 +++++++++++++++
>     >  sysdeps/riscv/rv64/rvv/strnlen.S   |  81 +++++++++++++
>     >  sysdeps/riscv/rv64/rvv/strrchr.S   |  88 ++++++++++++++
>     >  sysdeps/riscv/rv64/rvv/strspn.S    | 189 +++++++++++++++++++++++++++++
>     >  17 files changed, 1428 insertions(+)
>     >  create mode 100644 sysdeps/riscv/rv64/rvv/Implies
>     >  create mode 100644 sysdeps/riscv/rv64/rvv/memchr.S
>     >  create mode 100644 sysdeps/riscv/rv64/rvv/memcmp.S
>     >  create mode 100644 sysdeps/riscv/rv64/rvv/memcpy.S
>     >  create mode 100644 sysdeps/riscv/rv64/rvv/memmove.c
>     >  create mode 100644 sysdeps/riscv/rv64/rvv/memset.S
>     >  create mode 100644 sysdeps/riscv/rv64/rvv/strchr.S
>     >  create mode 100644 sysdeps/riscv/rv64/rvv/strchrnul.c
>     >  create mode 100644 sysdeps/riscv/rv64/rvv/strcmp.S
>     >  create mode 100644 sysdeps/riscv/rv64/rvv/strcpy.S
>     >  create mode 100644 sysdeps/riscv/rv64/rvv/strcspn.c
>     >  create mode 100644 sysdeps/riscv/rv64/rvv/strlen.S
>     >  create mode 100644 sysdeps/riscv/rv64/rvv/strncmp.S
>     >  create mode 100644 sysdeps/riscv/rv64/rvv/strncpy.S
>     >  create mode 100644 sysdeps/riscv/rv64/rvv/strnlen.S
>     >  create mode 100644 sysdeps/riscv/rv64/rvv/strrchr.S
>     >  create mode 100644 sysdeps/riscv/rv64/rvv/strspn.S
>     >
>     > diff --git a/sysdeps/riscv/rv64/rvv/Implies b/sysdeps/riscv/rv64/rvv/Implies
>     > new file mode 100644
>     > index 0000000000..b07b4cb906
>     > --- /dev/null
>     > +++ b/sysdeps/riscv/rv64/rvv/Implies
>     > @@ -0,0 +1,2 @@
>     > +riscv/rv64/rvd
>     > +
>     > diff --git a/sysdeps/riscv/rv64/rvv/memchr.S b/sysdeps/riscv/rv64/rvv/memchr.S
>     > new file mode 100644
>     > index 0000000000..a7e32b8f25
>     > --- /dev/null
>     > +++ b/sysdeps/riscv/rv64/rvv/memchr.S
>     > @@ -0,0 +1,127 @@
>     > +
> 
>     Spurious new line at the start.  We also require a brief comment describing
>     the file contents for newer files.
> 
>     > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
> 
>     Not sure 2012 range fits here.
> 
>     > +
>     > +   This file is part of the GNU C Library.
>     > +
>     > +   The GNU C Library is free software; you can redistribute it and/or
>     > +   modify it under the terms of the GNU Lesser General Public
>     > +   License as published by the Free Software Foundation; either
>     > +   version 2.1 of the License, or (at your option) any later version.
>     > +
>     > +   The GNU C Library is distributed in the hope that it will be useful,
>     > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>     > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>     > +   Lesser General Public License for more details.
>     > +
>     > +   You should have received a copy of the GNU Lesser General Public
>     > +   License along with the GNU C Library.  If not, see
>     > +   <https://www.gnu.org/licenses/>.  */
>     > +
>     > +
>     > +#include <sysdep.h>
>     > +
>     > +
>     > +/* Optimised memchr for riscv with vector extension
>     > + * Assumptions:
>     > + *    - cpu becomes bandwidth limited at or before
>     > + *            2 vector register sized read/write operations
>     > + *          + 2 scalar operations
>     > + *          + conditional branch
>     > + */
>     > +
>     > +.globl  memchr
>     > +.type   memchr,@function
>     > +
>     > +.align    2
>     > +memchr:
> 
>     We have the ENTRY macro for that.
> 
>     > +    beqz a2, .Lnot_found
> 
>     Maybe use the L macro here for local labels;
> 
>     > +    csrr    t1, vlenb
>     > +    bgeu    a2, t1, .Lvector_path   /* only use vector path if we're scanning
>     > +                                       at least vlenb bytes */
>     > +
>     > +#ifndef __riscv_strict_align
> 
>     Would this be defined by the compiler as a predefined macro or is it just a
>     debug switch? If the latter, I think it would be better to remove it.
> 
>     > +    li a3, 8
>     > +    blt a2, a3, .Lbytewise
>     > +
>     > +    li      t1, 0x0101010101010101
>     > +    slli    a4, t1, 7     /* a4 = 0x8080808080808080 */
>     > +    mul     t2, a1, t1    /* entirety of t2 is now repeats of target character;
>     > +                             assume mul is at worst no worse than 3*(shift+OR),
>     > +                             otherwise do that instead */
>     > +
>     > +/*
>     > + * strategy:
>     > + * t4 = ((*a0) ^ t2)
>     > + *      - now t4 contains zero bytes if and only if next word of memory
>     > + *        had target character at those positions
>     > + *
>     > + * t4 = ((t4-0x0101010101010101) & ~t4) & 0x8080808080808080
>     > + *      - all nonzero bytes of t4 become 0; zero bytes become 0x80
>     > + *
>     > + * if t4 is nonzero, find the index of the byte within it, add to a0 and return
>     > + * otherwise, loop
>     > + */
>     > +
>     > +1:
>     > +    ld t4, (a0)          /* t4 = load next 8 bytes */
>     > +    xor t4, t4, t2
>     > +    sub t5, t4, t1
>     > +    not t4, t4
>     > +    and t4, t5, t4
>     > +    and t4, t4, a4
>     > +    bnez t4, .Lbytewise  /* could use ctzw, mod+lookup or just binary chop
>     > +                            to locate byte of interest in t4 but profiling
>     > +                            shows these approaches are at best no better */
>     > +    addi a2, a2, -8
>     > +    addi a0, a0, 8
>     > +    bgeu a2, a3, 1b
>     > +    beqz a2, .Lnot_found
>     > +#endif // __riscv_strict_align
>     > +
>     > +/* too little data for a dword. mask calculation and branch mispredict costs
>     > +   make checking a word not worthwhile. degrade to bytewise search. */
>     > +
>     > +.Lbytewise:
>     > +    add t2, a0, a2
>     > +
>     > +1:
>     > +    lb t1, (a0)
>     > +    beq t1, a1, .Lfound
>     > +    addi a0, a0, 1
>     > +    blt a0, t2, 1b
>     > +
>     > +.Lnot_found:
>     > +    mv a0, zero
>     > +.Lfound:
>     > +    ret
>     > +
>     > +.Lvector_path:
>     > +    vsetvli t2, a2, e8, m2, ta, ma
>     > +
>     > +1:
>     > +    vle8.v      v2, (a0)
>     > +    vmseq.vx    v0, v2, a1
>     > +    vfirst.m    t3, v0
>     > +    bgez        t3, .Lvec_found
>     > +    add         a0, a0, t2
>     > +    sub         a2, a2, t2
>     > +    bge         a2, t2, 1b
>     > +    bnez        a2, 2f
>     > +    mv          a0, zero
>     > +    ret
>     > +
>     > +2:
>     > +    vsetvli t2, a2, e8, m2, ta, ma
>     > +    vle8.v      v2, (a0)
>     > +    vmseq.vx    v0, v2, a1
>     > +    vfirst.m    t3, v0
>     > +    bgez        t3, .Lvec_found
>     > +    mv          a0, zero
>     > +    ret
>     > +
>     > +.Lvec_found:
>     > +    add a0, a0, t3
>     > +    ret
>     > +
>     > +.size   memchr, .-memchr
>     > +libc_hidden_builtin_def (memchr)
>     > \ No newline at end of file
> 
>     Please add a newline.
> 
>     > diff --git a/sysdeps/riscv/rv64/rvv/strcpy.S b/sysdeps/riscv/rv64/rvv/strcpy.S
>     > new file mode 100644
>     > index 0000000000..b21909d66f
>     > --- /dev/null
>     > +++ b/sysdeps/riscv/rv64/rvv/strcpy.S
>     > @@ -0,0 +1,72 @@
>     > +
>     > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
>     > +
>     > +   This file is part of the GNU C Library.
>     > +
>     > +   The GNU C Library is free software; you can redistribute it and/or
>     > +   modify it under the terms of the GNU Lesser General Public
>     > +   License as published by the Free Software Foundation; either
>     > +   version 2.1 of the License, or (at your option) any later version.
>     > +
>     > +   The GNU C Library is distributed in the hope that it will be useful,
>     > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>     > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>     > +   Lesser General Public License for more details.
>     > +
>     > +   You should have received a copy of the GNU Lesser General Public
>     > +   License along with the GNU C Library.  If not, see
>     > +   <https://www.gnu.org/licenses/>.  */
>     > +
>     > +
> 
>     You can add an optimized stpcpy and use it to implement strcpy on top of
>     that (as my generic proposal does [1]).  ARMv6 does something similar [2].
> 
>     [1] https://patchwork.sourceware.org/project/glibc/patch/20230201170406.303978-12-adhemerval.zanella@linaro.org/
>     [2] https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/arm/armv6/strcpy.S;h=e9f63a56c1c605a21b05f7ac21412585b0705171;hb=HEAD
> 
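
(i.e. something along these lines, assuming an optimised stpcpy is available;
inside glibc the internal __stpcpy alias would be used.  A sketch, not the
actual proposed patch:)

  #include <string.h>

  char *
  strcpy (char *dest, const char *src)
  {
    /* stpcpy returns a pointer to the terminating NUL it wrote, so strcpy
       only has to hand back dest itself.  */
    stpcpy (dest, src);
    return dest;
  }
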
>     > +#include <sysdep.h>
>     > +
>     > +.globl  strcpy
>     > +.type   strcpy,@function
>     > +
>     > +/*
>     > + *  optimized strcpy for riscv with vector extension
>     > + *  assumptions:
>     > + *  - vlenb is a power of 2
>     > + *  - page size >= 2*vlenb
>     > + */
>     > +
>     > +.align    2
>     > +strcpy:
>     > +    mv          t0, a0                     /* copy dest so we can return it */
>     > +
>     > +    csrr        t1, vlenb                  /* find vlenb*2 */
>     > +    add         t1, t1, t1
>     > +
>     > +    addi        t2, t1, -1                 /* mask unaligned part of ptr */
>     > +    and         t2, a1, t2
>     > +    beqz        t2, .Laligned
>     > +
>     > +    sub         t2, t1, t2                 /* search enough to align ptr */
>     > +    vsetvli     t2, t2, e8, m2, tu, mu
>     > +    vle8.v      v2, (a1)
>     > +    vmseq.vx    v4, v2, zero
>     > +    vmsif.m     v0, v4                     /* copy but not past null */
>     > +    vfirst.m    t3, v4
>     > +    vse8.v      v2, (t0), v0.t
>     > +    bgez        t3, .Ldone
>     > +    add         t0, t0, t2
>     > +    add         a1, a1, t2
>     > +
>     > +.Laligned:
>     > +    vsetvli     zero, t1, e8, m2, ta, mu   /* now do 2*vlenb bytes per pass */
>     > +
>     > +1:
>     > +    vle8.v      v2, (a1)
>     > +    add         a1, a1, t1
>     > +    vmseq.vx    v4, v2, zero
>     > +    vmsif.m     v0, v4
>     > +    vfirst.m    t3, v4
>     > +    vse8.v      v2, (t0), v0.t
>     > +    add         t0, t0, t1
>     > +    bltz        t3, 1b
>     > +
>     > +.Ldone:
>     > +    ret
>     > +
>     > +.size   strcpy, .-strcpy
>     > +libc_hidden_builtin_def (strcpy)
>     > \ No newline at end of file
>     > diff --git a/sysdeps/riscv/rv64/rvv/strcspn.c b/sysdeps/riscv/rv64/rvv/strcspn.c
>     > new file mode 100644
>     > index 0000000000..f0595a72fb
>     > --- /dev/null
>     > +++ b/sysdeps/riscv/rv64/rvv/strcspn.c
>     > @@ -0,0 +1,22 @@
>     > +
>     > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
>     > +
>     > +   This file is part of the GNU C Library.
>     > +
>     > +   The GNU C Library is free software; you can redistribute it and/or
>     > +   modify it under the terms of the GNU Lesser General Public
>     > +   License as published by the Free Software Foundation; either
>     > +   version 2.1 of the License, or (at your option) any later version.
>     > +
>     > +   The GNU C Library is distributed in the hope that it will be useful,
>     > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>     > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>     > +   Lesser General Public License for more details.
>     > +
>     > +   You should have received a copy of the GNU Lesser General Public
>     > +   License along with the GNU C Library.  If not, see
>     > +   <https://www.gnu.org/licenses/>.  */
>     > +
>     > +
>     > +/* strcspn is implemented in strspn.S
>     > + */
>     > diff --git a/sysdeps/riscv/rv64/rvv/strlen.S b/sysdeps/riscv/rv64/rvv/strlen.S
>     > new file mode 100644
>     > index 0000000000..c77d500693
>     > --- /dev/null
>     > +++ b/sysdeps/riscv/rv64/rvv/strlen.S
>     > @@ -0,0 +1,67 @@
>     > +
>     > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
>     > +
>     > +   This file is part of the GNU C Library.
>     > +
>     > +   The GNU C Library is free software; you can redistribute it and/or
>     > +   modify it under the terms of the GNU Lesser General Public
>     > +   License as published by the Free Software Foundation; either
>     > +   version 2.1 of the License, or (at your option) any later version.
>     > +
>     > +   The GNU C Library is distributed in the hope that it will be useful,
>     > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>     > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>     > +   Lesser General Public License for more details.
>     > +
>     > +   You should have received a copy of the GNU Lesser General Public
>     > +   License along with the GNU C Library.  If not, see
>     > +   <https://www.gnu.org/licenses/>.  */
>     > +
>     > +
>     > +#include <sysdep.h>
>     > +
>     > +.globl  strlen
>     > +.type   strlen,@function
>     > +
>     > +/*
>     > + *  optimized strlen for riscv with vector extension
>     > + *  assumptions:
>     > + *  - vlenb is a power of 2
>     > + *  - page size >= 2*vlenb
>     > + */
>     > +
>     > +.align    2
>     > +strlen:
>     > +    mv          t4, a0                    /* copy of buffer start */
>     > +    csrr        t1, vlenb                 /* find vlenb*2 */
>     > +    add         t1, t1, t1
>     > +    addi        t2, t1, -1                /* mask off unaligned part of ptr */
>     > +    and         t2, a0, t2
>     > +    beqz        t2, .Laligned
>     > +
>     > +    sub         t2, t1, t2                /* search fwd to align ptr */
>     > +    vsetvli     t2, t2, e8, m2, ta, ma
>     > +    vle8.v      v2, (a0)
>     > +    vmseq.vx    v0, v2, zero
>     > +    vfirst.m    t3, v0
>     > +    bgez        t3, .Lfound
>     > +    add         a0, a0, t2
>     > +
>     > +.Laligned:
>     > +    vsetvli     zero, t1, e8, m2, ta, ma  /* search 2*vlenb bytes per pass */
>     > +    add         t4, t4, t1
>     > +
>     > +1:
>     > +    vle8.v      v2, (a0)
>     > +    add         a0, a0, t1
>     > +    vmseq.vx    v0, v2, zero
>     > +    vfirst.m    t3, v0
>     > +    bltz        t3, 1b
>     > +
>     > +.Lfound:                                  /* found the 0; subtract          */
>     > +    sub         a0, a0, t4                /* buffer start from current ptr  */
>     > +    add         a0, a0, t3                /* and add offset into fetched    */
>     > +    ret                                   /* data to get length */
>     > +
>     > +.size   strlen, .-strlen
>     > +libc_hidden_builtin_def (strlen)
>     > \ No newline at end of file
>     > diff --git a/sysdeps/riscv/rv64/rvv/strncmp.S b/sysdeps/riscv/rv64/rvv/strncmp.S
>     > new file mode 100644
>     > index 0000000000..863e5cb525
>     > --- /dev/null
>     > +++ b/sysdeps/riscv/rv64/rvv/strncmp.S
>     > @@ -0,0 +1,104 @@
>     > +
>     > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
>     > +
>     > +   This file is part of the GNU C Library.
>     > +
>     > +   The GNU C Library is free software; you can redistribute it and/or
>     > +   modify it under the terms of the GNU Lesser General Public
>     > +   License as published by the Free Software Foundation; either
>     > +   version 2.1 of the License, or (at your option) any later version.
>     > +
>     > +   The GNU C Library is distributed in the hope that it will be useful,
>     > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>     > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>     > +   Lesser General Public License for more details.
>     > +
>     > +   You should have received a copy of the GNU Lesser General Public
>     > +   License along with the GNU C Library.  If not, see
>     > +   <https://www.gnu.org/licenses/>.  */
>     > +
>     > +
>     > +#include <sysdep.h>
>     > +
>     > +.globl  strncmp
>     > +.type   strncmp,@function
>     > +
>     > +.align    2
>     > +
>     > +/* as strcmp, but with added checks on a2 (max count)
>     > + */
>     > +
>     > +strncmp:
>     > +    csrr        t1, vlenb                   /* find vlenb*2 */
>     > +    add         t1, t1, t1
>     > +    blt         a2, t1, .Ltail              /* degrade if max < vlenb*2 */
>     > +    vsetvli     zero, t1, e8, m2, ta, mu
>     > +    vid.v       v30
>     > +    addi        t2, t1, -1                  /* mask unaligned part of ptr */
>     > +    and         t6, a0, t2                  /* unaligned part of lhs */
>     > +    and         t5, a1, t2                  /* unaligned part of rhs */
>     > +    sub         t6, t1, t6                  /* safe count to read from lhs */
>     > +    sub         t5, t1, t5                  /* same, rhs */
>     > +    vmsltu.vx   v28, v30, t6                /* mask for first part of lhs */
>     > +    vmsltu.vx   v26, v30, t5                /* mask for first part of rhs */
>     > +    vmv.v.x     v16, zero
>     > +    vmv.v.x     v18, zero
>     > +
>     > +
>     > +1:  blt         a2, t1, .Ltail
>     > +    vmv.v.v     v0, v28                     /* lhs mask */
>     > +    vle8.v      v2, (a0), v0.t              /* masked load from lhs */
>     > +    vmseq.vx    v16, v2, zero, v0.t         /* check loaded bytes for null */
>     > +    vmv.v.v     v0, v26                     /*       rhs mask */
>     > +    vfirst.m    t2, v16                     /* get lhs check result */
>     > +    bgez        t2, .Ltail                  /* can we safely check rest */
>     > +    vle8.v      v4, (a1), v0.t              /*       masked load from rhs */
>     > +    vmseq.vx    v18, v4, zero, v0.t         /*       check partial rhs */
>     > +    vmnot.m     v0, v28                     /* mask for rest of lhs */
>     > +    vfirst.m    t3, v18                     /* get check result */
>     > +    bltz        t3, 2f                      /* test it */
>     > +    bge         t3, t6, .Ltail
>     > +
>     > +    vmsleu.vx   v0, v30, t3                 /* select rest of string + null */
>     > +    vmsne.vv    v0, v2, v4, v0.t            /* compare */
>     > +    vfirst.m    t3, v0
>     > +    bgez        t3, 3f
>     > +    mv          a0, zero
>     > +    ret
>     > +3:  add a0, a0, t3
>     > +    add a1, a1, t3
>     > +    lbu t0, (a0)
>     > +    lbu t1, (a1)
>     > +.Ldiff:
>     > +    sub a0, t0, t1
>     > +    ret
>     > +
>     > +    /* ...no null terminator in first part of lhs or rhs */
>     > +2:  vle8.v      v2, (a0), v0.t              /* load rest of lhs */
>     > +    vmnot.m     v0, v26                     /* mask for rest of rhs */
>     > +    vle8.v      v4, (a1), v0.t              /* load rest of rhs */
>     > +    vmsne.vv    v0, v2, v4                  /* compare */
>     > +    add         a0, a0, t1                  /* advance ptrs */
>     > +    vfirst.m    t3, v0
>     > +    add         a1, a1, t1
>     > +    sub         a2, a2, t1
>     > +    bltz        t3, 1b
>     > +
>     > +    sub t3, t3, t1  /* found a diff but we've already advanced a0 and a1 */
>     > +    j 3b
>     > +
>     > +.Ltail:
>     > +    beqz a2, 1f
>     > +    addi a2, a2, -1
>     > +    lbu t0, (a0)
>     > +    lbu t1, (a1)
>     > +    bne t0, t1, .Ldiff
>     > +    addi a0, a0, 1
>     > +    addi a1, a1, 1
>     > +    bnez t0, .Ltail
>     > +1:  mv a0, zero
>     > +    ret
>     > +
>     > +
>     > +.size strncmp, .-strncmp
>     > +libc_hidden_builtin_def (strncmp)
>     > \ No newline at end of file
>     > diff --git a/sysdeps/riscv/rv64/rvv/strncpy.S b/sysdeps/riscv/rv64/rvv/strncpy.S
>     > new file mode 100644
>     > index 0000000000..8b3a1e545c
>     > --- /dev/null
>     > +++ b/sysdeps/riscv/rv64/rvv/strncpy.S
>     > @@ -0,0 +1,96 @@
>     > +
>     > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
>     > +
>     > +   This file is part of the GNU C Library.
>     > +
>     > +   The GNU C Library is free software; you can redistribute it and/or
>     > +   modify it under the terms of the GNU Lesser General Public
>     > +   License as published by the Free Software Foundation; either
>     > +   version 2.1 of the License, or (at your option) any later version.
>     > +
>     > +   The GNU C Library is distributed in the hope that it will be useful,
>     > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>     > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>     > +   Lesser General Public License for more details.
>     > +
>     > +   You should have received a copy of the GNU Lesser General Public
>     > +   License along with the GNU C Library.  If not, see
>     > +   <https://www.gnu.org/licenses/>.  */
>     > +
>     > +
>     > +#include <sysdep.h>
>     > +
>     > +.globl  strncpy
>     > +.type   strncpy,@function
>     > +
>     > +/*
>     > + *  optimized strncpy for riscv with vector extension
>     > + *  assumptions:
>     > + *  - vlenb is a power of 2
>     > + *  - page size >= 2*vlenb
>     > + */
>     > +
>     > +.align    2
>     > +strncpy:
>     > +    mv          t0, a0                    /* need to return dest so copy */
>     > +
>     > +    csrr        t1, vlenb                 /* find vlenb*2 */
>     > +    add         t1, t1, t1
>     > +
>     > +    addi        t2, t1, -1                /* mask off unaligned part of ptr */
>     > +    and         t2, a1, t2
>     > +    beqz        t2, .Laligned
>     > +
>     > +    sub         t2, t1, t2                /* search to align the pointer */
>     > +    vsetvli     zero, t2, e8, m2, tu, mu
>     > +    vle8.v      v2, (a1)
>     > +    vmseq.vx    v4, v2, zero
>     > +    vmsif.m     v0, v4                    /* copy to dest */
>     > +    vfirst.m    t3, v4
>     > +    bgeu        t2, a2, .Ldest_full
>     > +    vse8.v      v2, (t0), v0.t
>     > +    bgez        t3, .Lterminator_found
>     > +    add         t0, t0, t2
>     > +    add         a1, a1, t2
>     > +    sub         a2, a2, t2
>     > +    beqz        a2, .Ldone
>     > +
>     > +.Laligned:
>     > +    vsetvli     zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
>     > +
>     > +1:
>     > +    vle8.v      v2, (a1)
>     > +    add         a1, a1, t1
>     > +    vmseq.vx    v4, v2, zero
>     > +    vmsif.m     v0, v4
>     > +    vfirst.m    t3, v4
>     > +    bgeu        t1, a2, .Ldest_full
>     > +    vse8.v      v2, (t0), v0.t
>     > +    add         t0, t0, t1
>     > +    sub         a2, a2, t1
>     > +    bltz        t3, 1b
>     > +    sub         t0, t0, t1
>     > +
>     > +.Lterminator_found:
>     > +    addi        sp, sp, -16
>     > +    sd          ra, 0(sp)
>     > +    sd          a0, 8(sp)
>     > +    add         a0, t0, t3
>     > +    mv          a1, zero
>     > +    sub         a2, a2, t3
>     > +    jal         ra, memset
>     > +    ld          ra, 0(sp)
>     > +    ld          a0, 8(sp)
>     > +    addi        sp, sp, 16
>     > +.Ldone:
>     > +    ret
>     > +
>     > +.Ldest_full:
>     > +    vid.v       v6
>     > +    vmsltu.vx   v4, v6, a2
>     > +    vmand.mm    v0, v0, v4
>     > +    vse8.v      v2, (t0), v0.t
>     > +    ret
>     > +
>     > +.size   strncpy, .-strncpy
>     > +libc_hidden_builtin_def (strncpy)
>     > \ No newline at end of file
>     > diff --git a/sysdeps/riscv/rv64/rvv/strnlen.S b/sysdeps/riscv/rv64/rvv/strnlen.S
>     > new file mode 100644
>     > index 0000000000..6d7ee65c7a
>     > --- /dev/null
>     > +++ b/sysdeps/riscv/rv64/rvv/strnlen.S
>     > @@ -0,0 +1,81 @@
>     > +
>     > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
>     > +
>     > +   This file is part of the GNU C Library.
>     > +
>     > +   The GNU C Library is free software; you can redistribute it and/or
>     > +   modify it under the terms of the GNU Lesser General Public
>     > +   License as published by the Free Software Foundation; either
>     > +   version 2.1 of the License, or (at your option) any later version.
>     > +
>     > +   The GNU C Library is distributed in the hope that it will be useful,
>     > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>     > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>     > +   Lesser General Public License for more details.
>     > +
>     > +   You should have received a copy of the GNU Lesser General Public
>     > +   License along with the GNU C Library.  If not, see
>     > +   <https://www.gnu.org/licenses/>.  */
>     > +
>     > +
>     > +#include <sysdep.h>
>     > +
> 
>     Maybe use a generic implementation that issues memchr (which should be optimized
>     using vector instructions) [3]?  It would be an extra function call, but it should really
>     help on both code size and icache pressure.
> 
>     [3] https://patchwork.sourceware.org/project/glibc/patch/20230201170406.303978-6-adhemerval.zanella@linaro.org/
> 
>     > +.globl  __strnlen
>     > +.type   __strnlen,@function
>     > +
>     > +/* vector optimized strnlen
>     > + * assume it's safe to read to the end of the page
>     > + * containing either a null terminator or the last byte of the count or both,
>     > + * but not past it
>     > + * assume page size >= vlenb*2
>     > + */
>     > +
>     > +.align    2
>     > +__strnlen:
>     > +    mv          t4, a0               /* stash a copy of start for later */
>     > +    beqz        a1, .LzeroCount
>     > +
>     > +    csrr        t1, vlenb            /* find vlenb*2 */
>     > +    add         t1, t1, t1
>     > +    addi        t2, t1, -1           /* mask off unaligned part of ptr */
>     > +    and         t2, a1, a0
>     > +    beqz        t2, .Laligned
>     > +
>     > +    sub         t2, t1, t2           /* search to align pointer to t1 */
>     > +    bgeu        t2, a1, 2f           /* check it's safe */
>     > +    mv          t2, a1               /* it's not! look as far as permitted */
>     > +2:  vsetvli     t2, t2, e8, m2, ta, ma
>     > +    vle8.v      v2, (a0)
>     > +    vmseq.vx    v0, v2, zero
>     > +    vfirst.m    t3, v0
>     > +    bgez        t3, .Lfound
>     > +    add         a0, a0, t2
>     > +    sub         a1, a1, t2
>     > +    bltu        a1, t1, .LreachedCount
>     > +
>     > +.Laligned:
>     > +    vsetvli     zero, t1, e8, m2, ta, ma    /* do 2*vlenb bytes per pass */
>     > +
>     > +1:  vle8.v      v2, (a0)
>     > +    sub         a1, a1, t1
>     > +    vmseq.vx    v0, v2, zero
>     > +    vfirst.m    t3, v0
>     > +    bgez        t3, .Lfound
>     > +    add         a0, a0, t1
>     > +    bgeu        a1, t1, 1b
>     > +.LreachedCount:
>     > +    mv          t2, a1    /* in case 0 < a1 < t1 */
>     > +    bnez        a1, 2b    /* if so, still t2 bytes to check, all safe */
>     > +.LzeroCount:
>     > +    sub         a0, a0, t4
>     > +    ret
>     > +
>     > +.Lfound:        /* found the 0; subtract buffer start from current pointer */
>     > +    add         a0, a0, t3 /* and add offset into fetched data */
>     > +    sub         a0, a0, t4
>     > +    ret
>     > +
>     > +.size   __strnlen, .-__strnlen
>     > +weak_alias (__strnlen, strnlen)
>     > +libc_hidden_builtin_def (__strnlen)
>     > +libc_hidden_builtin_def (strnlen)
>     > \ No newline at end of file
>     > diff --git a/sysdeps/riscv/rv64/rvv/strrchr.S b/sysdeps/riscv/rv64/rvv/strrchr.S
>     > new file mode 100644
>     > index 0000000000..4bef8a3b9c
>     > --- /dev/null
>     > +++ b/sysdeps/riscv/rv64/rvv/strrchr.S
>     > @@ -0,0 +1,88 @@
>     > +
>     > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
>     > +
>     > +   This file is part of the GNU C Library.
>     > +
>     > +   The GNU C Library is free software; you can redistribute it and/or
>     > +   modify it under the terms of the GNU Lesser General Public
>     > +   License as published by the Free Software Foundation; either
>     > +   version 2.1 of the License, or (at your option) any later version.
>     > +
>     > +   The GNU C Library is distributed in the hope that it will be useful,
>     > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>     > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>     > +   Lesser General Public License for more details.
>     > +
>     > +   You should have received a copy of the GNU Lesser General Public
>     > +   License along with the GNU C Library.  If not, see
>     > +   <https://www.gnu.org/licenses/>.  */
>     > +
>     > +
>     > +#include <sysdep.h>
> 
>     Is it really worth adding a strrchr optimization?  The generic implementation
>     already calls strchr (which should be optimized).
> 
>     > +
>     > +.globl  strrchr
>     > +.type   strrchr,@function
>     > +
>     > +/*
>     > + *  optimized strrchr for riscv with vector extension
>     > + *  assumptions:
>     > + *  - vlenb is a power of 2
>     > + *  - page size >= 2*vlenb
>     > + */
>     > +
>     > +.align    2
>     > +strrchr:
>     > +    mv          t5, a0    /* stash buffer ptr somewhere safe */
>     > +    mv          a0, zero  /* result is nullptr unless we find better below */
>     > +
>     > +    csrr        t1, vlenb                /* determine vlenb*2 */
>     > +    add         t1, t1, t1
>     > +    addi        t2, t1, -1               /* mask off unaligned part of ptr */
>     > +    and         t2, t5, t2
>     > +    beqz        t2, .Laligned
>     > +
>     > +    sub         t2, t1, t2               /* search to align ptr to 2*vlenb */
>     > +    vsetvli     t2, t2, e8, m2, ta, mu
>     > +
>     > +    vle8.v      v2, (t5)                 /* load data into v2(,v3) */
>     > +    vmseq.vx    v4, v2, zero             /* check for null terminator */
>     > +    vfirst.m    t4, v4                   /* grab its position, if any */
>     > +    vmsbf.m     v0, v4                   /* select valid chars */
>     > +    vmseq.vx    v0, v2, a1, v0.t         /* search for candidate byte */
>     > +    vfirst.m    t3, v0                   /* grab its position, if any */
>     > +    bltz        t3, 2f                   /* did we find a candidate? */
>     > +
>     > +3:  add         a0, t3, t5               /* we did! grab the address */
>     > +    vmsof.m     v1, v0                   /* there might be more than one */
>     > +    vmandn.mm   v0, v0, v1               /* so clear the one we just found */
>     > +    vfirst.m    t3, v0                   /* is there another? */
>     > +    bgez        t3, 3b
>     > +
>     > +2:  bgez        t4, .Ldone               /* did we see a null terminator? */
>     > +    add         t5, t5, t2
>     > +
>     > +.Laligned:
>     > +    vsetvli     zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
>     > +
>     > +1:  vle8.v      v2, (t5)
>     > +    vmseq.vx    v4, v2, zero
>     > +    vfirst.m    t4, v4
>     > +    vmsbf.m     v0, v4
>     > +    vmseq.vx    v0, v2, a1, v0.t
>     > +    vfirst.m    t3, v0
>     > +    bltz        t3, 2f
>     > +
>     > +3:  add         a0, t3, t5
>     > +    vmsof.m     v1, v0
>     > +    vmandn.mm   v0, v0, v1
>     > +    vfirst.m    t3, v0
>     > +    bgez        t3, 3b
>     > +
>     > +2:  add         t5, t5, t1
>     > +    bltz        t4, 1b
>     > +
>     > +.Ldone:
>     > +    ret
>     > +
>     > +.size   strrchr, .-strrchr
>     > +libc_hidden_builtin_def (strrchr)
>     > \ No newline at end of file
>     > diff --git a/sysdeps/riscv/rv64/rvv/strspn.S b/sysdeps/riscv/rv64/rvv/strspn.S
>     > new file mode 100644
>     > index 0000000000..2b9af5cc2d
>     > --- /dev/null
>     > +++ b/sysdeps/riscv/rv64/rvv/strspn.S
>     > @@ -0,0 +1,189 @@
>     > +
>     > +/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
>     > +
>     > +   This file is part of the GNU C Library.
>     > +
>     > +   The GNU C Library is free software; you can redistribute it and/or
>     > +   modify it under the terms of the GNU Lesser General Public
>     > +   License as published by the Free Software Foundation; either
>     > +   version 2.1 of the License, or (at your option) any later version.
>     > +
>     > +   The GNU C Library is distributed in the hope that it will be useful,
>     > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>     > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>     > +   Lesser General Public License for more details.
>     > +
>     > +   You should have received a copy of the GNU Lesser General Public
>     > +   License along with the GNU C Library.  If not, see
>     > +   <https://www.gnu.org/licenses/>.  */
>     > +
>     > +
>     > +#include <sysdep.h>
>     > +
>     > +.globl  strspn
>     > +.type   strspn,@function
>     > +
>     > +.globl  strcspn
>     > +.type   strcspn,@function
>     > +
>     > +/*
>     > + *  optimized strspn / strcspn for riscv with vector extension
>     > + *  assumptions:
>     > + *  - vlenb is a power of 2
>     > + *  - page size >= 32
>     > + *  strategy:
>     > + *  - build a 256-bit table on the stack, where each elt is zero
>     > + *    if encountering it should terminate computation and nonzero otherwise
>     > + *  - use vectorised lookups into this to check 2*vlen elts at a time;
>     > + *    this code is identical for strspn and strcspn and can be shared
>     > + *
>     > + *  note that while V mandates at least 128 bit wide regs,
>     > + *  we are building a 256 bit lookup table
>     > + *  therefore we use either LMUL=1 or 2 depending on what the target supports
>     > + *  therefore we only use even vector register numbers,
>     > + *  so everything still works if we go with LMUL=2
>     > + */
>     > +
> 
>     I wonder if we could adapt the generic implementation, so riscv only reimplements
>     the vectorized search instead of all the boilerplate to generate the table and
>     early tests.
> 
>     > +# -----------------------------
>     > +
>     > +.align    2
>     > +
>     > +strspn:
>     > +    lbu         t0, 0(a1)
>     > +    bnez        t0, .Lbuild_table
>     > +    mv          a0, zero
>     > +    ret
>     > +
>     > +.Lbuild_table:
>     > +    mv          a6, a0 /* store incoming a0 */
>     > +    li          t1, 32 /* want to deal with 256 bits at a time, so 32 bytes */
>     > +
>     > +    vsetvli     zero, t1, e8, m1, tu, mu
>     > +#if __riscv_v_min_vlen < 256
>     > +    /* we want to build a 256-bit table, so use vlenb*2,
>     > +     * m2 if regs are 128 bits wide or vlenb, m1 if >= 256
>     > +     * 'V' extension specifies a minimum vlen of 128 so this should cover
>     > +     * all cases; we can skip the check if we know vlen >= 256 at compile time
>     > +     */
>     > +    csrr        t2, vlenb
>     > +    bgeu        t2, t1, 1f
>     > +    vsetvli     zero, t1, e8, m2, tu, mu
>     > +1:
>     > +#endif // __riscv_v_min_vlen
>     > +
>     > +    /* read one char from the charset at a time and write the correct bit
>     > +     * in the lookup table; we could do SIMD iff we ever get an extension
>     > +     * that provides some way of scattering bytes into a reg group
>     > +     */
>     > +    vmv.v.x     v16, zero       /* clear out table */
>     > +    vmv.v.x     v8, zero        /* clear out v8 */
>     > +    li          t3, 1
>     > +    vmv.s.x     v8, t3          /* v8 now all zeroes except bottom byte */
>     > +
>     > +1:  vmv.v.x     v2, zero        /* clear out v2 */
>     > +    addi        a1, a1, 1       /* advance charset ptr */
>     > +    srli        t2, t0, 3       /* divide the byte we read earlier by 8 */
>     > +    vslideup.vx v2, v8, t2      /* v2 now 1 in the correct byte 0 elsewhere */
>     > +    vsll.vx     v2, v2, t0      /* v2 now 1 in the correct bit, 0 elsewhere */
>     > +    vor.vv      v16, v16, v2    /* or it in */
>     > +    lbu         t0, 0(a1)       /* fetch next byte */
>     > +    bnez        t0, 1b          /* if it's not null, go round again */
>     > +
>     > +/*
>     > + *   Table is now built in v16.
>     > + *   Strategy:
>     > + *   - fetch next t1 bytes from memory
>     > + *   - vrgather on their values divided by 8 to get relevant bytes of table
>     > + *   - shift right to get the correct bit into bit 0 (the lsb)
>     > + *   - and with 1, compare with expected terminator value, then check mask
>     > + *     to see if we've found a terminator
>     > + *
>     > + *   Before we can begin, a0 needs to be t1-aligned, so that when we fetch
>     > + *   the next t1 bytes - any of which may be the null terminator -
>     > + *   we do not cross a page boundary and read unmapped memory. Therefore
>     > + *   we have one read of however many bytes are needed to align a0,
>     > + *   before the main loop.
>     > + */
>     > +
>     > +.Lscan_table:
>     > +    vmv.v.x     v8, t3              /* v8 now t1 bytes of 0x01 */
>     > +
>     > +    and         t2, a0, t1          /* mask to align to t1 */
>     > +    beqz        t2, 2f              /* or skip if we're already aligned */
>     > +    sub         t2, t1, t2          /* t2 now bytes to read to align to t1 */
>     > +
>     > +    vid.v       v2                  /* build mask instead of changing vl */
>     > +    vmsltu.vx   v0, v2, t2          /* so we don't need to track LMUL */
>     > +
>     > +    vle8.v      v2, (a0), v0.t      /* load next bytes from input */
>     > +    vsrl.vi     v4, v2, 3           /* divide by 8 */
>     > +    vrgather.vv v6, v16, v4         /* corresponding bytes of bit table */
>     > +    vsrl.vv     v6, v6, v2          /* shift correct bits to lsb */
>     > +    vand.vv     v6, v6, v8          /* and with 1 to complete the lookups */
>     > +    vmseq.vx    v4, v6, zero, v0.t  /* check to see if any 0s are present */
>     > +    vfirst.m    t0, v4              /* index of the first 0, if any */
>     > +    bgez        t0, .Lscan_end      /* if we found one, stop */
>     > +    add         a0, a0, t2          /* advance by number of bytes we read */
>     > +
>     > +2:  add         a6, a6, t1     /* we'll advance a0 before the exit check */
>     > +1:  vle8.v      v2, (a0)       /* as above but unmasked so t1 elts per pass */
>     > +    add         a0, a0, t1
>     > +
>     > +    vsrl.vi     v4, v2, 3
>     > +    vrgather.vv v6, v16, v4
>     > +    vsrl.vv     v6, v6, v2
>     > +    vand.vv     v6, v6, v8
>     > +
>     > +    vmseq.vx    v4, v6, zero
>     > +    vfirst.m    t0, v4
>     > +    bltz        t0, 1b
>     > +
>     > +.Lscan_end:
>     > +    add         a0, a0, t0     /* calculate offset to terminating byte */
>     > +    sub         a0, a0, a6
>     > +    ret
>     > +.size   strspn, .-strspn
>     > +
>     > +/* strcspn
>     > + *
>     > + * table build exactly as for strspn, except:
>     > + * - the lookup table starts with all bits except bit 0 of byte 0 set
>     > + * - we clear the corresponding bit for each byte in the charset
>     > + * once table is built, we can reuse the scan code directly
>     > + */
>     > +
>     > +strcspn:
>     > +    lbu         t0, 0(a1)
>     > +    beqz        t0, strlen   /* no rejections -> prefix is whole string */
>     > +
>     > +    mv          a6, a0
>     > +    li          t1, 32
>     > +
>     > +    vsetvli     zero, t1, e8, m1, tu, mu
>     > +#if __riscv_v_min_vlen < 256
>     > +    csrr        t2, vlenb
>     > +    bgeu        t2, t1, 1f
>     > +    vsetvli     zero, t1, e8, m2, tu, mu
>     > +1:
>     > +#endif // __riscv_v_min_vlen
>     > +
>     > +    vmv.v.x     v8, zero
>     > +    li          t3, 1           /* all bits clear except bit 0 of byte 0 */
>     > +    vmv.s.x     v8, t3
>     > +    vnot.v      v16, v8         /* v16 is the inverse of that */
>     > +    li          t4, -1
>     > +
>     > +1:  vmv.v.x     v2, zero
>     > +    addi        a1, a1, 1       /* advance charset ptr */
>     > +    srli        t2, t0, 3       /* select correct bit in v2 */
>     > +    vslideup.vx v2, v8, t2
>     > +    vsll.vx     v2, v2, t0
>     > +    vnot.v      v2, v2          /* invert */
>     > +    vand.vv     v16, v16, v2    /* clear the relevant bit of table */
>     > +    lbu         t0, 0(a1)
>     > +    bnez        t0, 1b
>     > +    j           .Lscan_table
>     > +.size   strcspn, .-strcspn
>     > +
>     > +libc_hidden_builtin_def (strspn)
>     > +libc_hidden_builtin_def (strcspn)
>     > \ No newline at end of file
>
  
Sergei Lewis Feb. 2, 2023, 3:20 p.m. UTC | #13
> I think it would be better to provide vectorized mem* and
> str* that work independently of the compiler option used.

A generic vectorized mem*/str* with no scalar fallback has no issues with
alignment and is actually smaller and simpler code as well. The awkwardness
here is performance of very small operations, which are a significant
portion of the calls to these functions in practice in the wild: for
operations much smaller than the vector length, a scalar implementation is
faster - but this is only true if it either makes no unaligned accesses, or
unaligned accesses are permitted and reasonably performant on the target,
which (as others have mentioned here) may not be the case on RISCV; and
there is a limit to how much we can check each invocation without paying
more for the checks than we save. RISCV vector length, though, may be quite
large, so basing the tradeoff on that with fallback to a scalar loop that
just processes a byte per iteration may also be prohibitive.

Using ifuncs would, of course, provide a way to address this once the
required support / plumbing is in place. I'll look at shelving
the microoptimisations until then and sticking to more generic code here.
Using the newly visible OP_T_THRES from your patchset may be the way
forward in the interim.
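
As a concrete illustration of that interim approach, here is a minimal sketch
of a size-threshold dispatch (C, illustration only): OP_T_THRES is taken from
the generic string headers in the refactor mentioned above, while
vector_memcpy, byte_memcpy and my_memcpy are hypothetical names invented for
this sketch.

    #include <stddef.h>
    #include <string.h>

    #define OP_T_THRES 16   /* assumed value, for illustration only */

    /* Placeholder for the RVV path; the real code would use
       vsetvli/vle8.v/vse8.v.  */
    static void *
    vector_memcpy (void *dst, const void *src, size_t n)
    {
      return memcpy (dst, src, n);
    }

    /* Byte loop: no unaligned accesses, no vector setup overhead.  */
    static void *
    byte_memcpy (void *dst, const void *src, size_t n)
    {
      char *d = dst;
      const char *s = src;
      while (n--)
        *d++ = *s++;
      return dst;
    }

    void *
    my_memcpy (void *dst, const void *src, size_t n)
    {
      return n < OP_T_THRES ? byte_memcpy (dst, src, n)
                            : vector_memcpy (dst, src, n);
    }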


> Another option, which I will add on my default string refactor, it to use
> strlen plus memrchr:
>
>   char *strrchr (const char *s, int c)
>   {
>     return __memrchr (s, c, strlen(s) + 1);
>   }
>
> It would only 2 function calls and if the architecture provides optimized
> strlen and memrchr, the performance overhead should be only the additional
> functions call (which the advantage of less icache pressure).
>

This approach still means you're reading the entire s buffer twice in the
worst case: once to find the end, then again to scan it for c. It's not the
end of the world - often, s will be small and c present, and s will be in
cache for the second read, so the tradeoff is arguably less clear than
what's in the generic code now. I'll see if I can gather some more info on
usage in the wild.
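
For comparison, a minimal one-pass sketch (C, illustration only; not the glibc
generic code) that remembers the last match while scanning for the terminator,
so the buffer is read only once:

    #include <stddef.h>

    char *
    one_pass_strrchr (const char *s, int c)
    {
      const char *last = NULL;
      unsigned char ch = (unsigned char) c;

      for (;; s++)
        {
          if (*(const unsigned char *) s == ch)
            last = s;              /* remember the most recent match */
          if (*s == '\0')
            break;                 /* also handles c == '\0' correctly */
        }
      return (char *) last;
    }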


> I recall that I tested using a 256-bit bitfield instead of a 256-byte table,
> but it incurred some overhead on most architectures (I might check again).
> One option might be to parametrize both the table generation and the table
> search.

I'm using performance measurements here, of course; this sort of tradeoff
might be another one for ifuncs or even selected at runtime. What I suggest
might be slightly unfortunate here, though, is the creation of an internal
glibc api that forces one particular design choice on all platforms in all
situations.

Note that the cost of the table generation step is almost as important as
the cost of the scan - e.g. strtok() is implemented using these and
generally called in a tight loop by code in the wild; use of that directly
or similar patterns during parsing/tokenization means these functions are
typically invoked many times, and the standard library provides
no way for the table to be reused between calls even though
the accept/reject chars remain the same. So it is likely that people
optimising glibc for different platforms will still want to provide
optimized paths for the table generation even if it is factored out into a
separate module.
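
For reference, a scalar sketch of the 256-bit bitmap variant being discussed
(C, illustration only; a real implementation would vectorise the scan loop,
and bitmap_strspn is a name invented for this sketch):

    #include <stddef.h>
    #include <stdint.h>

    size_t
    bitmap_strspn (const char *s, const char *accept)
    {
      uint64_t table[4] = { 0, 0, 0, 0 };        /* 256 bits */

      /* Table generation: set one bit per accepted byte value.  */
      for (const unsigned char *a = (const unsigned char *) accept; *a; a++)
        table[*a >> 6] |= (uint64_t) 1 << (*a & 63);

      /* Scan: the terminator's bit is never set, so the loop stops there.  */
      const unsigned char *p = (const unsigned char *) s;
      while ((table[*p >> 6] >> (*p & 63)) & 1)
        p++;
      return (size_t) (p - (const unsigned char *) s);
    }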
  
Sergei Lewis Feb. 2, 2023, 3:35 p.m. UTC | #14
In general, I suggest caution with tradeoffs between function calls and
code reuse: on modern superscalar architectures, the cost of a mispredicted
branch can be huge in terms of the number of operations that could
otherwise get retired. Although the function invocation and return
themselves are completely predictable, each of these functions contains a
loop with a data-driven end condition, so it is essentially random and
all but guarantees a mispredict per call in practice. Folding the
functionality into a single loop (for tiny pieces of code like these, on
small inputs) is a noticeable win over two calls each with its own loop.

  
Vineet Gupta Feb. 3, 2023, 12:13 a.m. UTC | #15
On 2/1/23 11:03, Andrew Waterman wrote:
>> +#ifndef __riscv_strict_align
> strict-align is not the right thing to check here.  As the RVA profile
> document explains, all RVA-compliant implementations must support
> misaligned loads and stores (so strict-align will be false), but they
> might execute extremely slowly (e.g., via trap and emulate), and so
> this approach will unduly penalize some implementations.

FWIW, the proposed __riscv_strict_align, if generated, can have 2 possible
values:
  - 1 (explicit -mstrict-align used in the build)
  - 2 (cpu tune param indicates unaligned access is slow, e.g. with
    trap-and-emulate)

So in theory code can still be written to cater to that.

-Vineet
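
For illustration, a sketch of how code might consume the proposed macro,
assuming the two values described above (note that __riscv_strict_align with
these semantics is a proposal in this thread, not an existing GCC define, and
UNALIGNED_ACCESS_OK is a name invented for this sketch):

    #if defined __riscv_strict_align
    /* 1: -mstrict-align was given; misaligned accesses must not be emitted.
       2: the tuning model says misaligned accesses are legal but slow
          (e.g. trap-and-emulate), so they should still be avoided.  */
    # define UNALIGNED_ACCESS_OK 0
    #else
    # define UNALIGNED_ACCESS_OK 1
    #endif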
  
Andrew Waterman Feb. 3, 2023, 12:51 a.m. UTC | #16
On Thu, Feb 2, 2023 at 4:13 PM Vineet Gupta <vineetg@rivosinc.com> wrote:
>
>
>
> On 2/1/23 11:03, Andrew Waterman wrote:
> >> +#ifndef __riscv_strict_align
> > strict-align is not the right thing to check here.  As the RVA profile
> > document explains, all RVA-compliant implementations must support
> > misaligned loads and stores (so strict-align will be false), but they
> > might execute extremely slowly (e.g., via trap and emulate), and so
> > this approach will unduly penalize some implementations.
>
> FWIW, the proposed __riscv_strict_align if generated can have 2 possible
> values:
>   - 1 (explicit -mstrict-align used in build)
>   - 2 cpu tune param indicated unaligned access is slow (like with
> trap-n-emulate)

Yeah, those semantics make sense.  It makes the "strict" name a little
misleading, though: "strict" suggests to me that misaligned accesses
are outright illegal.  So, it might be better to pick another name,
e.g. __riscv_avoid_misaligned.

>
> So if theory code can still be written to cater to that.

It had better cater to it in practice as well as in theory.  Standard
binary distributions need to be suitably generic, and they need to
heed the guidance in the RVA profile spec.  So this is OK as long as
the default continues to be to avoid misaligned accesses.  The fact
that GCC's default -mtune setting is to mark them as slow means this
is probably OK.

This may be Yet Another IFUNC Case: use misaligned accesses only if
known at runtime that they are fast, or when there's a routine
available that's optimized for a specific microarchitecture.
(Microarchitecture-specific IFUNCs would probably be more appropriate
for these routines, anyway, since they're pretty clearly tuned for a
particular machine.  For example, the trivial 7-instruction memcpy
loop recommended in the specification will perform well across a
broader range of vector machines.)

>
> -Vineet
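
A minimal sketch of the IFUNC approach described above (C with GCC's ifunc
attribute; rvv_memcpy and the variant and probe functions are placeholder
names invented here, since at the time of this thread there was no settled
kernel/user interface for querying misaligned-access performance):

    #include <stddef.h>
    #include <string.h>

    /* Placeholder variants; the real ones would be the tuned routines.  */
    static void *
    memcpy_aligned_only (void *d, const void *s, size_t n)
    {
      return memcpy (d, s, n);
    }

    static void *
    memcpy_misaligned_ok (void *d, const void *s, size_t n)
    {
      return memcpy (d, s, n);
    }

    /* Placeholder probe: stay conservative until a real query exists.  */
    static int
    unaligned_access_is_fast (void)
    {
      return 0;
    }

    static void *
    resolve_memcpy (void)
    {
      return unaligned_access_is_fast () ? (void *) memcpy_misaligned_ok
                                         : (void *) memcpy_aligned_only;
    }

    /* Bound at load time to whichever variant the resolver picks.  */
    void *rvv_memcpy (void *, const void *, size_t)
      __attribute__ ((ifunc ("resolve_memcpy")));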
  
Adhemerval Zanella Netto Feb. 3, 2023, 11:35 a.m. UTC | #17
On 02/02/23 12:20, Sergei Lewis wrote:
>      I think it would be better to provide vectorized mem* and 
> 
>     str* that work indendently of the compiler option used.
> 
> 
> A generic vectorized mem*/str* with no scalar fallback has no issues with alignment and is actually smaller and simpler code as well. The awkwardness here is performance of very small operations, which are a significant portion of the calls to these functions in practice in the wild: for operations much smaller than the vector length, a scalar implementation is faster - but this is only true if it either makes no unaligned accesses, or unaligned accesses are permitted and reasonably performant on the target, which (as others have mentioned here) may not be the case on RISCV; and there is a limit to how much we can check each invocation without paying more for the checks than we save. RISCV vector length, though, may be quite large, so basing the tradeoff on that with fallback to a scalar loop that just processes a byte per iteration may also be prohibitive.
> 
> Using ifuncs would, of course, provide a way to address this once the required support / plumbing is in place. I'll look at shelving the microoptimisations until then and sticking to more generic code here. Using the newly visible OP_T_THRES from your patchset may be the way forward in the interim.

Yes, this is similar to all other architectures that provide vector/simd
instructions (aarch64 SVE for instance uses a similar strategy).  That's
not the issue, in fact; the issue is having an extra compiler define
flag that affects binary distribution and incurs extra maintenance
that is not tied to the ABI definition.  It means that we will need to also
build/check the RVV code with __riscv_strict_align.

Is there a meaningful performance difference in always using the
__riscv_strict_align code path, so that we have an implementation that works
regardless of whether the chip has fast unaligned memory access support?

The arbitrary VLEN upper bound and page size limit are also worrisome, as
Andrew has pointed out.  I would prefer to either have a generic implementation
that works without such limits (or assumes something more sane, like aarch64
does with unaligned access and a minimum supported 4k page size), or have
a way to fall back to the default implementation if the criteria are not met.


>  
> 
>     Another option, which I will add on my default string refactor, it to use
>     strlen plus memrchr:
> 
>       char *strrchr (const char *s, int c)
>       {
>         return __memrchr (s, c, strlen(s) + 1);
>       }
> 
>     It would only 2 function calls and if the architecture provides optimized
>     strlen and memrchr, the performance overhead should be only the additional
>     functions call (which the advantage of less icache pressure).
> 
> 
> This approach still means you're reading the entire s buffer twice in the worst case: once to find the end, then again to scan it for c. It's not the end of the world - often, s will be small and c present, and s will be in cache for the second read, so the tradeoff is arguably less clear than what's in the generic code now. I'll see if I can gather some more info on usage in the wild.

Yes, ideally we would do the strrchr in only one pass over the string.  But the
idea here is really to try to use composability, especially when the symbol
is unlikely to be a hotspot.

>  
> 
>     I recall that I tested using a 256-bit bitfield instead of 256-byte table, but
>     it incured in some overhead on most architecture (I might check again).
> 
>     One option might to parametrize both the table generation and the table search,
> 
> 
> I'm using performance measurements here, of course; this sort of tradeoff might be another one for ifuncs or even selected at runtime. What I suggest might be slightly unfortunate here, though, is the creation of an internal glibc api that forces one particular design choice on all platforms in all situations. 

I see it the other way around: it eases maintenance and removes complexity.  To
give you an example, many memchr implementations use a strategy of first
calculating the final address from the input address and size.  And some failed
to take into consideration that it might overflow [1].  We had to fix it in
multiple implementations, whereas with a parametrized implementation we could
just fix the default one.
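
To make the hazard in [1] concrete, a small sketch (C, illustration only; not
the actual glibc code) of why counting the remaining length down avoids the
end-pointer overflow:

    #include <stddef.h>

    void *
    memchr_counted (const void *s, int c, size_t n)
    {
      const unsigned char *p = s;
      unsigned char ch = (unsigned char) c;

      /* Hazardous form: `const unsigned char *end = p + n;` can wrap for a
         huge n, making a `p < end` loop exit immediately.  Counting n down
         never computes an out-of-range pointer.  */
      while (n-- != 0)
        {
          if (*p == ch)
            return (void *) p;
          p++;
        }
      return NULL;
    }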

Of course, where the architecture optimization does not fit in the generic
framework, it is up to arch maintainers to use a specialized one.  But the
generic implementation I am proposing has been used over the years by multiple
architectures to generate arch-specific implementations.

[1] https://sourceware.org/bugzilla/show_bug.cgi?id=21182 

>  
> Note that the cost of the table generation step is almost as important as the cost of the scan - e.g. strtok() is implemented using these and generally called in a tight loop by code in the wild; use of that directly or similar patterns during parsing/tokenization means these functions are typically invoked many times, frequently, and the standard library provides no way for the table to be reused the table between the calls even though the accept/reject chars remain the same. So it is likely that people optimising glibc for different platforms will still want to provide optimized paths for the table generation even if it is factored out into a separate module.

I think using a bitfield for the table might be profitable for tightly
called loops as you suggest; I will evaluate its usage in the generic code.
But again, my idea is to parametrize only what the architecture needs to
optimize and, even as the compiler improves (through autovectorization or
other optimization passes), to avoid the need for such arch-specific
implementations.
  
Sergei Lewis Feb. 3, 2023, 2:04 p.m. UTC | #18
> Is there a meaningful performance difference in always using the
> __riscv_strict_align code path?


Minimum VLEN is 16 bytes, so for random operations we will average 8
iterations of a bytewise loop vs 2 of a word-oriented one.
That said, I'm currently working on a v2 patch that removes the scalar
fallback entirely - over a suite of random operation sizes, dropping the
word-based loop is more expensive than just not having the fallback at all.
So these patterns will all go away and the code will look much more like
what's in the ISA manual.


> The VLEN arbitrary upper bound and page size limit is also worrisome, as
> Andrew has pointed out.  I would prefer to either have a generic
> implementation
> that works without such limits
>

I realise I have not responded there yet - I'm certainly not ignoring this,
but investigating options before I commit; e.g. another option might be to
gate the affected code behind a compile-time vlen check, and use
fault-only-first loads, as Andrew suggests, where not enough information
was provided at compile time to prove the approach is safe.

These decisions will all become much more straightforward with ifunc
support - a generic version for the most common situation and runtime
selection of more specific versions would resolve all these issues and also
open the gates for people working on widely different implementations to
easily provide their own versions of as many or as few of these functions
as needed - and I do expect there will be a number of these, since the
architecture is super flexible and the ecosystem is already looking quite
fragmented. Accordingly, I am also investigating what will be involved in
getting ifunc support in place.
  
Sergei Lewis Feb. 6, 2023, 12:49 p.m. UTC | #19
FWIW it turns out that IFUNC resolution is supported by gcc for riscv
targets where all the underlying tools support it (which, in the current
riscv-gnu-toolchain, they do). Support is detected and enabled here:
https://github.com/gcc-mirror/gcc/blob/master/gcc/configure.ac#L3027

Having a stable finalised kernel/user interface for caps would help, but
for this purpose a few CSR reads should hopefully be enough to make sane
decisions.
  
Yun Hsiang May 3, 2023, 2:11 a.m. UTC | #20
On Wed, Feb 01, 2023 at 09:52:32AM +0000, Sergei Lewis wrote:
> 
> Initial implementations of memchr, memcmp, memcpy, memmove, memset, strchr,
> strcmp, strcpy, strlen, strncmp, strncpy, strnlen, strrchr, strspn
> targeting the riscv "V" extension, version 1.0
> 
> The vectorised implementations assume VLENB of at least 128 and at least 32
> registers (as mandated by the "V" extension spec). They also assume that
> VLENB is a power of two which is no larger than the page size, and (as
> vectorised code in glibc for other platforms does) that it is safe to read
> past null terminators / buffer ends provided one does not cross a page
> boundary.
I've tried to apply these patches to run benchtests & tests, but I
encountered some errors at runtime while running string/test-*. Do
these patches depend on others?

> /* ignore */

In the strnlen implementation:
> +#include <sysdep.h>
> +
> +.globl  __strnlen
> +.type   __strnlen,@function
> +
> +/* vector optimized strnlen
> + * assume it's safe to read to the end of the page
> + * containing either a null terminator or the last byte of the count or both,
> + * but not past it
> + * assume page size >= vlenb*2
> + */
> +
> +.align    2
> +__strnlen:
> +    mv          t4, a0               /* stash a copy of start for later */
> +    beqz        a1, .LzeroCount
> +
> +    csrr        t1, vlenb            /* find vlenb*2 */
> +    add         t1, t1, t1
> +    addi        t2, t1, -1           /* mask off unaligned part of ptr */
> +    and         t2, a1, a0
Should this line be `and t2, t2, a0`?
> +    beqz        t2, .Laligned
> +
> +    sub         t2, t1, t2           /* search to align pointer to t1 */
> +    bgeu        t2, a1, 2f           /* check it's safe */
> +    mv          t2, a1               /* it's not! look as far as permitted */
> +2:  vsetvli     t2, t2, e8, m2, ta, ma
> +    vle8.v      v2, (a0)
> +    vmseq.vx    v0, v2, zero
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lfound
> +    add         a0, a0, t2
> +    sub         a1, a1, t2
> +    bltu        a1, t1, .LreachedCount
> +
> +.Laligned:
> +    vsetvli     zero, t1, e8, m2, ta, ma    /* do 2*vlenb bytes per pass */
> +
> +1:  vle8.v      v2, (a0)
> +    sub         a1, a1, t1
If a1(maxlen) is smaller than t1(vlenb*2) in the first loop,
a1(maxlen) will become a negative value.
Then strnlen might get the wrong result.
> +    vmseq.vx    v0, v2, zero
> +    vfirst.m    t3, v0
> +    bgez        t3, .Lfound
> +    add         a0, a0, t1
> +    bgeu        a1, t1, 1b
> +.LreachedCount:
> +    mv          t2, a1    /* in case 0 < a1 < t1 */
> +    bnez        a1, 2b    /* if so, still t2 bytes to check, all safe */
> +.LzeroCount:
> +    sub         a0, a0, t4
> +    ret
> +
> +.Lfound:        /* found the 0; subtract buffer start from current pointer */
> +    add         a0, a0, t3 /* and add offset into fetched data */
> +    sub         a0, a0, t4
> +    ret
> +
> +.size   __strnlen, .-__strnlen
> +weak_alias (__strnlen, strnlen)
> +libc_hidden_builtin_def (__strnlen)
> +libc_hidden_builtin_def (strnlen)
  

Patch

diff --git a/sysdeps/riscv/rv64/rvv/Implies b/sysdeps/riscv/rv64/rvv/Implies
new file mode 100644
index 0000000000..b07b4cb906
--- /dev/null
+++ b/sysdeps/riscv/rv64/rvv/Implies
@@ -0,0 +1,2 @@ 
+riscv/rv64/rvd
+
diff --git a/sysdeps/riscv/rv64/rvv/memchr.S b/sysdeps/riscv/rv64/rvv/memchr.S
new file mode 100644
index 0000000000..a7e32b8f25
--- /dev/null
+++ b/sysdeps/riscv/rv64/rvv/memchr.S
@@ -0,0 +1,127 @@ 
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+
+
+/* Optimised memchr for riscv with vector extension
+ * Assumptions:
+ *    - cpu becomes bandwidth limited at or before
+ *            2 vector register sized read/write operations
+ *          + 2 scalar operations
+ *          + conditional branch
+ */
+
+.globl  memchr
+.type   memchr,@function
+
+.align    2
+memchr:
+    beqz a2, .Lnot_found
+    csrr    t1, vlenb
+    bgeu    a2, t1, .Lvector_path   /* only use vector path if we're scanning
+                                       at least vlenb bytes */
+
+#ifndef __riscv_strict_align
+    li a3, 8
+    blt a2, a3, .Lbytewise
+
+    li      t1, 0x0101010101010101
+    slli    a4, t1, 7     /* a4 = 0x8080808080808080 */
+    mul     t2, a1, t1    /* entirety of t2 is now repeats of target character;
+                             assume mul is at worst no worse than 3*(shift+OR),
+                             otherwise do that instead */
+
+/*
+ * strategy:
+ * t4 = ((*a0) ^ t2)
+ *      - now t4 contains zero bytes if and only if next word of memory
+ *        had target character at those positions
+ *
+ * t4 = ((t4-0x0101010101010101) & ~t4) & 0x8080808080808080
+ *      - all nonzero bytes of t4 become 0; zero bytes become 0x80
+ *
+ * if t4 is nonzero, find the index of the byte within it, add to a0 and return
+ * otherwise, loop
+ */
+
+1:
+    ld t4, (a0)          /* t4 = load next 8 bytes */
+    xor t4, t4, t2
+    sub t5, t4, t1
+    not t4, t4
+    and t4, t5, t4
+    and t4, t4, a4
+    bnez t4, .Lbytewise  /* could use ctzw, mod+lookup or just binary chop
+                            to locate byte of interest in t4 but profiling
+                            shows these approaches are at best no better */
+    addi a2, a2, -8
+    addi a0, a0, 8
+    bgeu a2, a3, 1b
+    beqz a2, .Lnot_found
+#endif // __riscv_strict_align
+
+/* too little data for a dword. mask calculation and branch mispredict costs
+   make checking a word not worthwhile. degrade to bytewise search. */
+
+.Lbytewise:
+    add t2, a0, a2
+
+1:
+    lb t1, (a0)
+    beq t1, a1, .Lfound
+    addi a0, a0, 1
+    blt a0, t2, 1b
+
+.Lnot_found:
+    mv a0, zero
+.Lfound:
+    ret
+
+.Lvector_path:
+    vsetvli t2, a2, e8, m2, ta, ma
+
+1:
+    vle8.v      v2, (a0)
+    vmseq.vx    v0, v2, a1
+    vfirst.m    t3, v0
+    bgez        t3, .Lvec_found
+    add         a0, a0, t2
+    sub         a2, a2, t2
+    bge         a2, t2, 1b
+    bnez        a2, 2f
+    mv          a0, zero
+    ret
+
+2:
+    vsetvli t2, a2, e8, m2, ta, ma
+    vle8.v      v2, (a0)
+    vmseq.vx    v0, v2, a1
+    vfirst.m    t3, v0
+    bgez        t3, .Lvec_found
+    mv          a0, zero
+    ret
+
+.Lvec_found:
+    add a0, a0, t3
+    ret
+
+.size   memchr, .-memchr
+libc_hidden_builtin_def (memchr)
\ No newline at end of file
diff --git a/sysdeps/riscv/rv64/rvv/memcmp.S b/sysdeps/riscv/rv64/rvv/memcmp.S
new file mode 100644
index 0000000000..a945753a5f
--- /dev/null
+++ b/sysdeps/riscv/rv64/rvv/memcmp.S
@@ -0,0 +1,93 @@ 
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+
+/* Optimised memcmp for riscv with vector extension
+ */
+
+.globl  memcmp
+.type   memcmp,@function
+
+.align    2
+
+memcmp:
+    mv t2, zero
+    beqz a2, .Ldone
+
+    li          t1, 5            /* scalar path cheaper for 1-4 elts */
+    bltu        a2, t1, .Lscalar
+
+    /* main loop, vlenb*2 elts at a time */
+    vsetvli     t1, a2, e8, m2, ta, ma
+
+1:
+    vle8.v      v2, (a0)        /* load elts */
+    vle8.v      v4, (a1)
+    vmsne.vv    v0, v2, v4      /* compare */
+    vfirst.m    t3, v0
+    bgez        t3, .Lvec_diff  /* found a difference ? */
+    add         a0, a0, t1      /* not yet, advance everything */
+    add         a1, a1, t1
+    sub         a2, a2, t1
+    bgeu        a2, t1, 1b
+
+    bnez        a2, .Ltail
+    mv          a0, zero
+    ret
+
+.Ltail:
+    /* handle tail. we know a2 < vlenb*2 so just load and compare the lot */
+    vsetvli     t1, a2, e8, m2, ta, ma
+    vle8.v      v2, (a0)
+    vle8.v      v4, (a1)
+    vmsne.vv    v0, v2, v4
+    vfirst.m    t3, v0
+    bgez        t3, .Lvec_diff
+    mv          a0, zero         /* no diff found */
+    ret
+
+.Lvec_diff:         /* v2, v4 differ at elt t3 */
+    add a0, a0, t3
+    add a1, a1, t3
+    lbu t0, (a0)
+    lbu t1, (a1)
+    sub a0, t0, t1
+    ret
+
+.Lscalar:
+    add  t3, a0, a2
+
+1:
+    lbu t0, (a0)
+    lbu t1, (a1)
+    sub t2, t0, t1
+    bnez t2, .Ldone
+    addi a0, a0, 1
+    addi a1, a1, 1
+    bltu a0, t3, 1b
+
+.Ldone:
+    mv a0, t2
+    ret
+
+
+.size memcmp, .-memcmp
+libc_hidden_builtin_def (memcmp)
\ No newline at end of file
diff --git a/sysdeps/riscv/rv64/rvv/memcpy.S b/sysdeps/riscv/rv64/rvv/memcpy.S
new file mode 100644
index 0000000000..7b37ec285d
--- /dev/null
+++ b/sysdeps/riscv/rv64/rvv/memcpy.S
@@ -0,0 +1,154 @@ 
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+
+/* Optimised memcpy and memmove for riscv with vector extension
+ */
+
+.globl  memcpy
+.type   memcpy,@function
+.globl  memmove
+.type   memmove,@function
+
+.align    2
+memmove:
+    bge     a0, a1, .Lmemcpy_rev
+
+memcpy:
+.Lmemcpy_fwd:
+    mv      t0, a0          /* t0 = preserve a0 so we can return it */
+    csrr    t2, vlenb       /* t2 = number of bytes per vectorised copy op */
+    slli    t5, t2,  1      /* t5 = number of bytes per loop */
+    addi    t3, t5, -1      /* generate mask */
+    not     t4, t3
+    and     t4, a2, t4      /* t4 = bytes copied in vectorised pass */
+
+    beqz    t4, .Lscalar_fwd    /* size too small for even one pass? */
+
+    and    a2, a2, t3           /* a2 = bytes still left to copy after pass */
+    add    t4, t4, a1           /* t4 = src at end of vectorised pass */
+
+1:
+    vl2r.v  v2, (a1)            /* load, advance source */
+    add     a1, a1, t5
+    vs2r.v  v2, (t0)            /* store, advance dest */
+    add     t0, t0, t5
+    bltu    a1, t4, 1b          /* src at end? */
+
+    bltu    a2, t2, .Lscalar_fwd /* should we do one more vec load/store? */
+    vl1r.v  v2, (a1)
+    sub     a2, a2, t2
+    add     a1, a1, t2
+    vs1r.v  v2, (t0)
+    add     t0, t0, t2
+
+.Lscalar_fwd:
+    bnez    a2, .Lnobail
+.Lbail:
+    ret
+.Lnobail:
+
+#ifndef __riscv_strict_align
+    addi    t2, zero, 4
+    bltu    a2, t2, .Lsingle_bytes
+1:
+    lw      t3, 0(a1)
+    addi    a1, a1, 4
+    sw      t3, 0(t0)
+    addi    t0, t0, 4
+    addi    a2, a2, -4
+    bgeu    a2, t2, 1b
+#endif // __riscv_strict_align
+
+.Lsingle_bytes:
+    beqz    a2, .Lbail
+    add     a2, a2, a1          /* a2 = src + remaining size */
+1:
+    lb      t1, 0(a1)
+    sb      t1, 0(t0)
+    addi    a1, a1, 1
+    addi    t0, t0, 1
+    bltu    a1, a2, 1b
+    ret
+.size   memcpy,     .-memcpy
+
+
+.Lmemcpy_rev:
+    beq     a0, a1, .Lmemcpy_rev_done
+    add     t0, a0, a2          /* t0 = dest so we can return a0=dest later */
+    add     t6, a1, a2          /* dest and src both point to byte */
+                                /* immediately after end of buffer */
+
+    csrr    t2, vlenb           /* t2 = number of bytes per pass */
+    slli    t5, t2,  1          /* t5 = number of bytes per entire loop */
+    addi    t3, t5, -1          /* t3 = (bytes per loop) mask */
+    not     t4, t3              /* generate mask for bytes processed by loop */
+    and     t4, a2, t4          /* t4 = bytes copied in vectorised pass */
+
+    beqz    t4, .Lscalar_rev    /* size too small for even one pass? */
+
+    and    a2, a2, t3           /* a2 = bytes still left to copy after pass */
+    sub    t4, t6, t4           /* t4 = src at end of vectorised pass */
+
+1:
+    sub     t6, t6, t5
+    sub     t0, t0, t5
+    vl2r.v  v2, (t6)            /* load, advance source */
+    vs2r.v  v2, (t0)            /* store, advance dest */
+    bgtu    t6, t4, 1b          /* src at end? */
+
+    bltu    a2, t2, .Lscalar_rev /* should we do one more vec load/store? */
+    sub     t6, t6, t2
+    sub     t0, t0, t2
+    sub     a2, a2, t2
+    vl1r.v  v2, (t6)
+    vs1r.v  v2, (t0)
+
+.Lscalar_rev:
+#ifndef __riscv_strict_align
+    beqz    a2, .Lbail
+
+    addi    t2, zero, 4
+    bltu    a2, t2, 2f
+1:
+    addi    t6, t6, -4
+    addi    t0, t0, -4
+    addi    a2, a2, -4
+    lw      t3, 0(t6)
+    sw      t3, 0(t0)
+    bgeu    a2, t2, 1b
+2:
+#endif // __riscv_strict_align
+
+    beqz    a2, .Lbail
+1:
+    addi    t6, t6, -1
+    addi    t0, t0, -1
+    lb      t1, 0(t6)
+    sb      t1, 0(t0)
+    bgtu    t0, a0, 1b
+
+.Lmemcpy_rev_done:
+    ret
+
+.size   memmove, .-memmove
+libc_hidden_builtin_def (memcpy)
+libc_hidden_builtin_def (memmove)
\ No newline at end of file
diff --git a/sysdeps/riscv/rv64/rvv/memmove.c b/sysdeps/riscv/rv64/rvv/memmove.c
new file mode 100644
index 0000000000..47734854f9
--- /dev/null
+++ b/sysdeps/riscv/rv64/rvv/memmove.c
@@ -0,0 +1,22 @@ 
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+/* memmove is implemented in memcpy.S
+ */
diff --git a/sysdeps/riscv/rv64/rvv/memset.S b/sysdeps/riscv/rv64/rvv/memset.S
new file mode 100644
index 0000000000..6f82c542b1
--- /dev/null
+++ b/sysdeps/riscv/rv64/rvv/memset.S
@@ -0,0 +1,89 @@ 
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+
+
+/* Optimised memset for riscv with vector extension
+ */
+
+.globl  memset
+.type   memset,@function
+
+.align    2
+memset:
+    mv      t0, a0                  /* t0 = dest so we can return a0 later */
+    vsetvli t2, a2, e8, m2, ta, ma  /* t2 = elts per copy */
+    beqz    t2, .Lscalar
+
+    vmv.v.x v2, a1                  /* splat value across v2 */
+
+    slli    t3, t2, 1
+    bgtu    t3, a2, .Lsinglestore
+
+1:
+    vse8.v  v2, (t0)
+    add     t0, t0, t2
+    vse8.v  v2, (t0)
+    add     t0, t0, t2
+    sub     a2, a2, t3
+    bgeu    a2, t3, 1b
+    bgeu    a2, t2, .Lsinglestore
+    bnez    a2, .Lscalar
+
+.Lbail:
+    ret
+
+.Lsinglestore:
+    bgtu    t2, a2, .Lscalar
+    vse8.v  v2, (t0)
+    add     t0, t0, t2
+    sub     a2, a2, t2
+
+.Lscalar:
+    beqz    a2, .Lbail
+
+#ifndef __riscv_strict_align
+    slli    t2, a1, 8
+    or      a1, a1, t2
+    slli    t2, a1, 16
+    or      a1, a1, t2
+
+    addi    t2, zero, 4
+    bltu    a2, t2, 2f
+
+1:
+    sw      a1, 0(t0)
+    addi    t0, t0, 4
+    addi    a2, a2, -4
+    bgeu    a2, t2, 1b
+2:
+    beqz    a2, .Lbail
+#endif // __riscv_strict_align
+
+    add     a2, a2, t0
+1:
+    sb      a1, 0(t0)
+    addi    t0, t0, 1
+    bltu    t0, a2, 1b
+    ret
+
+.size   memset,     .-memset
+libc_hidden_builtin_def (memset)
\ No newline at end of file
diff --git a/sysdeps/riscv/rv64/rvv/strchr.S b/sysdeps/riscv/rv64/rvv/strchr.S
new file mode 100644
index 0000000000..0b37174c55
--- /dev/null
+++ b/sysdeps/riscv/rv64/rvv/strchr.S
@@ -0,0 +1,92 @@ 
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+
+.globl  strchr
+.type   strchr,@function
+
+.globl  __strchrnul
+.type   __strchrnul,@function
+
+/*
+ *  optimized strchr for riscv with vector extension
+ *  assumptions:
+ *  - vlenb is a power of 2
+ *  - page size >= 2*vlenb
+ */
+
+.align    2
+__strchrnul:
+    li t5, -1
+    j  1f
+
+strchr:
+    mv          t5, zero
+1:  csrr        t1, vlenb              /* find vlenb*2 */
+    add         t1, t1, t1
+    addi        t2, t1, -1             /* mask off unaligned part of pointer */
+    and         t2, a0, t2
+    beqz        t2, .Laligned
+
+    sub         t2, t1, t2             /* search however many bytes
+                                          are needed to align the pointer */
+    vsetvli     t2, t2, e8, m2, ta, mu
+
+    vle8.v      v2, (a0)               /* load data into v2(,v3) */
+    vmseq.vx    v4, v2, zero
+    vfirst.m    t4, v4
+    vmsbf.m     v0, v4
+    vmseq.vx    v0, v2, a1, v0.t
+    vfirst.m    t3, v0
+    bgez        t3, .Lfound
+    bgez        t4, .Lbufferend
+    add         a0, a0, t2
+
+.Laligned:
+    vsetvli     zero, t1, e8, m2, ta, mu
+    li          t4, -1
+
+1:
+    vle8.v      v2, (a0)
+    vmseq.vx    v4, v2, zero
+    vfirst.m    t4, v4
+    vmsbf.m     v0, v4
+    vmseq.vx    v0, v2, a1, v0.t
+    vfirst.m    t3, v0
+    bgez        t3, .Lfound
+    bgez        t4, .Lbufferend
+    add         a0, a0, t1
+    j           1b
+
+.Lfound:                                    /* found the target at a0+t3 */
+    add         a0, a0, t3
+    ret
+
+.Lbufferend:
+    add         a0, a0, t4
+    and         a0, a0, t5
+    ret
+
+.size   strchr, .-strchr
+.size   __strchrnul, .-__strchrnul
+
+libc_hidden_builtin_def (strchr)
+weak_alias (__strchrnul, strchrnul)
\ No newline at end of file
diff --git a/sysdeps/riscv/rv64/rvv/strchrnul.c b/sysdeps/riscv/rv64/rvv/strchrnul.c
new file mode 100644
index 0000000000..259da80358
--- /dev/null
+++ b/sysdeps/riscv/rv64/rvv/strchrnul.c
@@ -0,0 +1,22 @@ 
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+/* strchrnul is implemented in strchr.S
+ */
diff --git a/sysdeps/riscv/rv64/rvv/strcmp.S b/sysdeps/riscv/rv64/rvv/strcmp.S
new file mode 100644
index 0000000000..4a219221ac
--- /dev/null
+++ b/sysdeps/riscv/rv64/rvv/strcmp.S
@@ -0,0 +1,108 @@ 
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+
+.globl  strcmp
+.type   strcmp,@function
+
+.align    2
+
+/* most of the time, one or both sides is unaligned and their alignments differ
+ * we need to check for a null terminator before crossing a page boundary
+ * strategy:
+ * - for each side, calculate masks for alignment and (vlenb * 2) - alignment
+ * - while no difference encountered:
+ * -   for each side:
+ * -       load bytes to end of next vlenb*2 block
+ * -       check for null terminator
+ * -       if no terminator, load bytes to fill rest of register
+ * -   compare sides
+ */
+
+strcmp:
+    csrr        t1, vlenb                   /* find vlenb*2 */
+    add         t1, t1, t1
+    vsetvli     zero, t1, e8, m2, ta, mu
+    vid.v       v30
+    addi        t2, t1, -1         /* mask for unaligned part of ptr */
+    and         t6, a0, t2         /* unaligned part of lhs */
+    and         t5, a1, t2         /* unaligned part of rhs */
+    sub         t6, t1, t6         /* safe number of lhs bytes to read */
+    sub         t5, t1, t5         /* same, rhs */
+    vmsltu.vx   v28, v30, t6       /* v28 = mask for first half of lhs load */
+    vmsltu.vx   v26, v30, t5       /* v26 = mask for first half of rhs load */
+    vmv.v.x     v16, zero
+    vmv.v.x     v18, zero
+
+1:  vmv.v.v     v0, v28              /* lhs mask */
+    vle8.v      v2, (a0), v0.t       /* masked load from lhs */
+    vmseq.vx    v16, v2, zero, v0.t  /* check loaded bytes for null */
+    vmv.v.v     v0, v26              /*       rhs mask */
+    vfirst.m    t2, v16              /* get lhs check result */
+    bgez        t2, .Ltail           /* bail if we can't safely check rest */
+    vle8.v      v4, (a1), v0.t       /*    masked load from rhs */
+    vmseq.vx    v18, v4, zero, v0.t  /*    check partial rhs for null */
+    vmnot.m     v0, v28              /* mask for rest of lhs */
+    vfirst.m    t3, v18              /* get check result */
+    bltz        t3, 2f               /* test it */
+                                     /* we see null terminator */
+    bge         t3, t6, .Ltail       /* have enough bytes for vector cmp? */
+
+    vmsleu.vx   v0, v30, t3          /* select rest + null */
+    vmsne.vv    v0, v2, v4, v0.t     /* compare */
+    vfirst.m    t3, v0
+    bgez        t3, 3f
+    mv          a0, zero             /* no difference */
+    ret
+3:  add a0, a0, t3
+    add a1, a1, t3
+    lbu t0, (a0)
+    lbu t1, (a1)
+.Ldiff:
+    sub a0, t0, t1
+    ret
+
+    /* ...no null terminator */
+2:  vle8.v      v2, (a0), v0.t       /* load rest of lhs */
+    vmnot.m     v0, v26              /* mask for rest of rhs */
+    vle8.v      v4, (a1), v0.t       /* load rest of rhs */
+    vmsne.vv    v0, v2, v4           /* compare */
+    add         a0, a0, t1           /* advance ptrs */
+    vfirst.m    t3, v0
+    add         a1, a1, t1
+    bltz        t3, 1b
+
+    sub t3, t3, t1 /* found difference but we've already advanced a0 and a1 */
+    j 3b
+
+.Ltail:
+    lbu t0, (a0)
+    lbu t1, (a1)
+    bne t0, t1, .Ldiff
+    addi a0, a0, 1
+    addi a1, a1, 1
+    bnez t0, .Ltail
+    mv a0, zero
+    ret
+
+
+.size strcmp, .-strcmp
+libc_hidden_builtin_def (strcmp)
\ No newline at end of file
diff --git a/sysdeps/riscv/rv64/rvv/strcpy.S b/sysdeps/riscv/rv64/rvv/strcpy.S
new file mode 100644
index 0000000000..b21909d66f
--- /dev/null
+++ b/sysdeps/riscv/rv64/rvv/strcpy.S
@@ -0,0 +1,72 @@ 
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+
+.globl  strcpy
+.type   strcpy,@function
+
+/*
+ *  optimized strcpy for riscv with vector extension
+ *  assumptions:
+ *  - vlenb is a power of 2
+ *  - page size >= 2*vlenb
+ */
+
+.align    2
+strcpy:
+    mv          t0, a0                     /* copy dest so we can return it */
+
+    csrr        t1, vlenb                  /* find vlenb*2 */
+    add         t1, t1, t1
+
+    addi        t2, t1, -1                 /* mask unaligned part of ptr */
+    and         t2, a1, t2
+    beqz        t2, .Laligned
+
+    sub         t2, t1, t2                 /* search enough to align ptr */
+    vsetvli     t2, t2, e8, m2, tu, mu
+    vle8.v      v2, (a1)
+    vmseq.vx    v4, v2, zero
+    vmsif.m     v0, v4                     /* copy but not past null */
+    vfirst.m    t3, v4
+    vse8.v      v2, (t0), v0.t
+    bgez        t3, .Ldone
+    add         t0, t0, t2
+    add         a1, a1, t2
+
+.Laligned:
+    vsetvli     zero, t1, e8, m2, ta, mu   /* now do 2*vlenb bytes per pass */
+
+1:
+    vle8.v      v2, (a1)
+    add         a1, a1, t1
+    vmseq.vx    v4, v2, zero
+    vmsif.m     v0, v4
+    vfirst.m    t3, v4
+    vse8.v      v2, (t0), v0.t
+    add         t0, t0, t1
+    bltz        t3, 1b
+
+.Ldone:
+    ret
+
+.size   strcpy, .-strcpy
+libc_hidden_builtin_def (strcpy)
\ No newline at end of file
diff --git a/sysdeps/riscv/rv64/rvv/strcspn.c b/sysdeps/riscv/rv64/rvv/strcspn.c
new file mode 100644
index 0000000000..f0595a72fb
--- /dev/null
+++ b/sysdeps/riscv/rv64/rvv/strcspn.c
@@ -0,0 +1,22 @@ 
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+/* strcspn is implemented in strspn.S
+ */
diff --git a/sysdeps/riscv/rv64/rvv/strlen.S b/sysdeps/riscv/rv64/rvv/strlen.S
new file mode 100644
index 0000000000..c77d500693
--- /dev/null
+++ b/sysdeps/riscv/rv64/rvv/strlen.S
@@ -0,0 +1,67 @@ 
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+
+.globl  strlen
+.type   strlen,@function
+
+/*
+ *  optimized strlen for riscv with vector extension
+ *  assumptions:
+ *  - vlenb is a power of 2
+ *  - page size >= 2*vlenb
+ */
+
+.align    2
+strlen:
+    mv          t4, a0                    /* copy of buffer start */
+    csrr        t1, vlenb                 /* find vlenb*2 */
+    add         t1, t1, t1
+    addi        t2, t1, -1                /* mask off unaligned part of ptr */
+    and         t2, a0, t2
+    beqz        t2, .Laligned
+
+    sub         t2, t1, t2                /* search fwd to align ptr */
+    vsetvli     t2, t2, e8, m2, ta, ma
+    vle8.v      v2, (a0)
+    vmseq.vx    v0, v2, zero
+    vfirst.m    t3, v0
+    bgez        t3, .Lfound
+    add         a0, a0, t2
+
+.Laligned:
+    vsetvli     zero, t1, e8, m2, ta, ma  /* search 2*vlenb bytes per pass */
+    add         t4, t4, t1
+
+1:
+    vle8.v      v2, (a0)
+    add         a0, a0, t1
+    vmseq.vx    v0, v2, zero
+    vfirst.m    t3, v0
+    bltz        t3, 1b
+
+.Lfound:                                  /* found the 0; subtract          */
+    sub         a0, a0, t4                /* buffer start from current ptr  */
+    add         a0, a0, t3                /* and add offset into fetched    */
+    ret                                   /* data to get length */
+
+.size   strlen, .-strlen
+libc_hidden_builtin_def (strlen)
\ No newline at end of file
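
For reference, a scalar model of the scan above (not part of the patch).
CHUNK stands in for 2*vlenb and strlen_model is a made-up name; as in the
assembly, chunk reads are assumed never to cross a page boundary once the
pointer is aligned:

#include <stdint.h>
#include <string.h>

#define CHUNK 32                       /* placeholder for 2*vlenb */

static size_t
strlen_model (const char *s)
{
  const char *p = s;
  size_t n = (-(uintptr_t) p) & (CHUNK - 1);       /* unaligned head bytes */
  if (n == 0)
    n = CHUNK;
  for (;; p += n, n = CHUNK)
    {
      const char *nul = memchr (p, '\0', n);       /* vmseq.vx + vfirst.m */
      if (nul)
        return (size_t) (nul - s);                 /* .Lfound arithmetic */
    }
}
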
diff --git a/sysdeps/riscv/rv64/rvv/strncmp.S b/sysdeps/riscv/rv64/rvv/strncmp.S
new file mode 100644
index 0000000000..863e5cb525
--- /dev/null
+++ b/sysdeps/riscv/rv64/rvv/strncmp.S
@@ -0,0 +1,104 @@ 
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+
+.globl  strncmp
+.type   strncmp,@function
+
+.align    2
+
+/* as strcmp, but with added checks on a2 (max count)
+ */
+
+strncmp:
+    csrr        t1, vlenb                   /* find vlenb*2 */
+    add         t1, t1, t1
+    blt         a2, t1, .Ltail              /* degrade if max < vlenb*2 */
+    vsetvli     zero, t1, e8, m2, ta, mu
+    vid.v       v30
+    addi        t2, t1, -1                  /* mask unaligned part of ptr */
+    and         t6, a0, t2                  /* unaligned part of lhs */
+    and         t5, a1, t2                  /* unaligned part of rhs */
+    sub         t6, t1, t6                  /* safe count to read from lhs */
+    sub         t5, t1, t5                  /* same, rhs */
+    vmsltu.vx   v28, v30, t6                /* mask for first part of lhs */
+    vmsltu.vx   v26, v30, t5                /* mask for first part of rhs */
+    vmv.v.x     v16, zero
+    vmv.v.x     v18, zero
+
+
+1:  blt         a2, t1, .Ltail
+    vmv.v.v     v0, v28                     /* lhs mask */
+    vle8.v      v2, (a0), v0.t              /* masked load from lhs */
+    vmseq.vx    v16, v2, zero, v0.t         /* check loaded bytes for null */
+    vmv.v.v     v0, v26                     /*       rhs mask */
+    vfirst.m    t2, v16                     /* get lhs check result */
+    bgez        t2, .Ltail                  /* can we safely check rest */
+    vle8.v      v4, (a1), v0.t              /*       masked load from rhs */
+    vmseq.vx    v18, v4, zero, v0.t         /*       check partial rhs */
+    vmnot.m     v0, v28                     /* mask for rest of lhs */
+    vfirst.m    t3, v18                     /* get check result */
+    bltz        t3, 2f                      /* test it */
+    bge         t3, t6, .Ltail
+
+    vmsleu.vx   v0, v30, t3                 /* select rest of string + null */
+    vmsne.vv    v0, v2, v4, v0.t            /* compare */
+    vfirst.m    t3, v0
+    bgez        t3, 3f
+    mv          a0, zero
+    ret
+3:  add a0, a0, t3
+    add a1, a1, t3
+    lbu t0, (a0)
+    lbu t1, (a1)
+.Ldiff:
+    sub a0, t0, t1
+    ret
+
+    /* ...no null terminator in first part of lhs or rhs */
+2:  vle8.v      v2, (a0), v0.t              /* load rest of lhs */
+    vmnot.m     v0, v26                     /* mask for rest of rhs */
+    vle8.v      v4, (a1), v0.t              /* load rest of rhs */
+    vmsne.vv    v0, v2, v4                  /* compare */
+    add         a0, a0, t1                  /* advance ptrs */
+    vfirst.m    t3, v0
+    add         a1, a1, t1
+    sub         a2, a2, t1
+    bltz        t3, 1b
+
+    sub t3, t3, t1  /* found a diff but we've already advanced a0 and a1 */
+    j 3b
+
+.Ltail:
+    beqz a2, 1f
+    addi a2, a2, -1
+    lbu t0, (a0)
+    lbu t1, (a1)
+    bne t0, t1, .Ldiff
+    addi a0, a0, 1
+    addi a1, a1, 1
+    bnez t0, .Ltail
+1:  mv a0, zero
+    ret
+
+
+.size strncmp, .-strncmp
+libc_hidden_builtin_def (strncmp)
\ No newline at end of file
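
For reference, a scalar model of the .Ltail loop above, which is used both
for the final partial block and whenever fewer than 2*vlenb bytes of the
count remain; strncmp_tail is a made-up name and not part of the patch:

#include <stddef.h>

static int
strncmp_tail (const unsigned char *lhs, const unsigned char *rhs, size_t n)
{
  while (n-- > 0)
    {
      unsigned char a = *lhs++;
      unsigned char b = *rhs++;
      if (a != b)
        return a - b;               /* .Ldiff */
      if (a == '\0')
        return 0;                   /* terminator reached on both sides */
    }
  return 0;                         /* count exhausted without a difference */
}
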
diff --git a/sysdeps/riscv/rv64/rvv/strncpy.S b/sysdeps/riscv/rv64/rvv/strncpy.S
new file mode 100644
index 0000000000..8b3a1e545c
--- /dev/null
+++ b/sysdeps/riscv/rv64/rvv/strncpy.S
@@ -0,0 +1,96 @@ 
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+
+.globl  strncpy
+.type   strncpy,@function
+
+/*
+ *  optimized strncpy for riscv with vector extension
+ *  assumptions:
+ *  - vlenb is a power of 2
+ *  - page size >= 2*vlenb
+ */
+
+.align    2
+strncpy:
+    mv          t0, a0                    /* copy dest so we can return it */
+
+    csrr        t1, vlenb                 /* find vlenb*2 */
+    add         t1, t1, t1
+
+    addi        t2, t1, -1                /* mask off unaligned part of ptr */
+    and         t2, a1, t2
+    beqz        t2, .Laligned
+
+    sub         t2, t1, t2                /* search to align the pointer */
+    vsetvli     zero, t2, e8, m2, tu, mu
+    vle8.v      v2, (a1)
+    vmseq.vx    v4, v2, zero
+    vmsif.m     v0, v4                    /* copy to dest */
+    vfirst.m    t3, v4
+    bgeu        t2, a2, .Ldest_full
+    vse8.v      v2, (t0), v0.t
+    bgez        t3, .Lterminator_found
+    add         t0, t0, t2
+    add         a1, a1, t2
+    sub         a2, a2, t2
+    beqz        a2, .Ldone
+
+.Laligned:
+    vsetvli     zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
+
+1:
+    vle8.v      v2, (a1)
+    add         a1, a1, t1
+    vmseq.vx    v4, v2, zero
+    vmsif.m     v0, v4
+    vfirst.m    t3, v4
+    bgeu        t1, a2, .Ldest_full
+    vse8.v      v2, (t0), v0.t
+    add         t0, t0, t1
+    sub         a2, a2, t1
+    bltz        t3, 1b
+    sub         t0, t0, t1            /* undo the final advance of dest...  */
+    add         a2, a2, t1            /* ...and of the remaining count      */
+
+.Lterminator_found:
+    addi        sp, sp, -16
+    sd          ra, 0(sp)
+    sd          a0, 8(sp)
+    add         a0, t0, t3
+    mv          a1, zero
+    sub         a2, a2, t3
+    jal         ra, memset
+    ld          ra, 0(sp)
+    ld          a0, 8(sp)
+    addi        sp, sp, 16
+.Ldone:
+    ret
+
+.Ldest_full:
+    vid.v       v6
+    vmsltu.vx   v4, v6, a2
+    vmand.mm    v0, v0, v4
+    vse8.v      v2, (t0), v0.t
+    ret
+
+.size   strncpy, .-strncpy
+libc_hidden_builtin_def (strncpy)
\ No newline at end of file
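
For reference, the overall behaviour the routine above implements, written in
plain C (not part of the patch).  The vector code copies in 2*vlenb blocks
and tail-calls into memset for the zero padding; strncpy_model is a made-up
name:

#include <string.h>

static char *
strncpy_model (char *dst, const char *src, size_t n)
{
  size_t len = 0;
  while (len < n && src[len] != '\0')
    len++;                          /* bytes before the terminator */
  memcpy (dst, src, len);           /* vectorised copy in the .S file */
  if (len < n)
    memset (dst + len, 0, n - len); /* .Lterminator_found: jal memset */
  return dst;
}
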
diff --git a/sysdeps/riscv/rv64/rvv/strnlen.S b/sysdeps/riscv/rv64/rvv/strnlen.S
new file mode 100644
index 0000000000..6d7ee65c7a
--- /dev/null
+++ b/sysdeps/riscv/rv64/rvv/strnlen.S
@@ -0,0 +1,81 @@ 
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+
+.globl  __strnlen
+.type   __strnlen,@function
+
+/* vector optimized strnlen
+ * assumptions:
+ * - it is safe to read to the end of the page containing either the null
+ *   terminator or the byte at offset maxlen-1, but not past it
+ * - page size >= vlenb*2
+ */
+
+.align    2
+__strnlen:
+    mv          t4, a0               /* stash a copy of start for later */
+    beqz        a1, .LzeroCount
+
+    csrr        t1, vlenb            /* find vlenb*2 */
+    add         t1, t1, t1
+    addi        t2, t1, -1           /* mask off unaligned part of ptr */
+    and         t2, a0, t2
+    beqz        t2, .Laligned
+
+    sub         t2, t1, t2           /* search to align pointer to t1 */
+    bgeu        t2, a1, 2f           /* check it's safe */
+    mv          t2, a1               /* it's not! look as far as permitted */
+2:  vsetvli     t2, t2, e8, m2, ta, ma
+    vle8.v      v2, (a0)
+    vmseq.vx    v0, v2, zero
+    vfirst.m    t3, v0
+    bgez        t3, .Lfound
+    add         a0, a0, t2
+    sub         a1, a1, t2
+    bltu        a1, t1, .LreachedCount
+
+.Laligned:
+    vsetvli     zero, t1, e8, m2, ta, ma    /* do 2*vlenb bytes per pass */
+
+1:  vle8.v      v2, (a0)
+    sub         a1, a1, t1
+    vmseq.vx    v0, v2, zero
+    vfirst.m    t3, v0
+    bgez        t3, .Lfound
+    add         a0, a0, t1
+    bgeu        a1, t1, 1b
+.LreachedCount:
+    mv          t2, a1    /* in case 0 < a1 < t1 */
+    bnez        a1, 2b    /* if so, still t2 bytes to check, all safe */
+.LzeroCount:
+    sub         a0, a0, t4
+    ret
+
+.Lfound:        /* found the 0; subtract buffer start from current pointer */
+    add         a0, a0, t3 /* and add offset into fetched data */
+    sub         a0, a0, t4
+    ret
+
+.size   __strnlen, .-__strnlen
+weak_alias (__strnlen, strnlen)
+libc_hidden_builtin_def (__strnlen)
+libc_hidden_builtin_def (strnlen)
\ No newline at end of file
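
For reference, a scalar model of the count handling above (not part of the
patch).  CHUNK stands in for 2*vlenb; strnlen_model and CHUNK are made-up
names.  Both the head read and the final read are clamped to the remaining
count, matching the bgeu / .LreachedCount logic:

#include <stdint.h>
#include <string.h>

#define CHUNK 32                        /* placeholder for 2*vlenb */

static size_t
strnlen_model (const char *s, size_t maxlen)
{
  size_t done = 0;
  size_t head = (-(uintptr_t) s) & (CHUNK - 1);    /* bytes until aligned */
  while (done < maxlen)
    {
      size_t n = (done == 0 && head != 0) ? head : CHUNK;
      if (n > maxlen - done)
        n = maxlen - done;             /* never scan past the count */
      const char *nul = memchr (s + done, '\0', n);
      if (nul)
        return (size_t) (nul - s);
      done += n;
    }
  return maxlen;                        /* .LzeroCount / count exhausted */
}
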
diff --git a/sysdeps/riscv/rv64/rvv/strrchr.S b/sysdeps/riscv/rv64/rvv/strrchr.S
new file mode 100644
index 0000000000..4bef8a3b9c
--- /dev/null
+++ b/sysdeps/riscv/rv64/rvv/strrchr.S
@@ -0,0 +1,88 @@ 
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+
+.globl  strrchr
+.type   strrchr,@function
+
+/*
+ *  optimized strrchr for riscv with vector extension
+ *  assumptions:
+ *  - vlenb is a power of 2
+ *  - page size >= 2*vlenb
+ */
+
+.align    2
+strrchr:
+    mv          t5, a0    /* stash buffer ptr somewhere safe */
+    mv          a0, zero  /* result is nullptr unless we find better below */
+
+    csrr        t1, vlenb                /* determine vlenb*2 */
+    add         t1, t1, t1
+    addi        t2, t1, -1               /* mask off unaligned part of ptr */
+    and         t2, t5, t2
+    beqz        t2, .Laligned
+
+    sub         t2, t1, t2               /* search to align ptr to 2*vlenb */
+    vsetvli     t2, t2, e8, m2, ta, mu
+
+    vle8.v      v2, (t5)                 /* load data into v2(,v3) */
+    vmseq.vx    v4, v2, zero             /* check for null terminator */
+    vfirst.m    t4, v4                   /* grab its position, if any */
+    vmsif.m     v0, v4                   /* select chars up to & incl. null */
+    vmseq.vx    v0, v2, a1, v0.t         /* search for candidate byte */
+    vfirst.m    t3, v0                   /* grab its position, if any */
+    bltz        t3, 2f                   /* did we find a candidate? */
+
+3:  add         a0, t3, t5               /* we did! grab the address */
+    vmsof.m     v1, v0                   /* there might be more than one */
+    vmandn.mm   v0, v0, v1               /* so clear the one we just found */
+    vfirst.m    t3, v0                   /* is there another? */
+    bgez        t3, 3b
+
+2:  bgez        t4, .Ldone               /* did we see a null terminator? */
+    add         t5, t5, t2
+
+.Laligned:
+    vsetvli     zero, t1, e8, m2, ta, mu /* now do 2*vlenb bytes per pass */
+
+1:  vle8.v      v2, (t5)
+    vmseq.vx    v4, v2, zero
+    vfirst.m    t4, v4
+    vmsif.m     v0, v4
+    vmseq.vx    v0, v2, a1, v0.t
+    vfirst.m    t3, v0
+    bltz        t3, 2f
+
+3:  add         a0, t3, t5
+    vmsof.m     v1, v0
+    vmandn.mm   v0, v0, v1
+    vfirst.m    t3, v0
+    bgez        t3, 3b
+
+2:  add         t5, t5, t1
+    bltz        t4, 1b
+
+.Ldone:
+    ret
+
+.size   strrchr, .-strrchr
+libc_hidden_builtin_def (strrchr)
\ No newline at end of file
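
For reference, the per-block match handling in the loops above, modelled in C
with a 64-bit mask (not part of the patch; the names below are made up).
Matches within a block come back as a bit mask; the code repeatedly takes the
lowest set bit (vfirst.m), records its address, and clears it
(vmsof.m + vmandn.mm), so the last address recorded is the rightmost match in
that block:

#include <stdint.h>
#include <stddef.h>

static const char *
last_match_in_block (const char *block_base, uint64_t match_mask,
                     const char *prev_result)
{
  const char *last = prev_result;        /* keep result from earlier blocks */
  while (match_mask != 0)
    {
      unsigned bit = (unsigned) __builtin_ctzll (match_mask);  /* vfirst.m */
      last = block_base + bit;           /* add a0, t3, t5 */
      match_mask &= match_mask - 1;      /* clear the bit we just handled */
    }
  return last;
}
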
diff --git a/sysdeps/riscv/rv64/rvv/strspn.S b/sysdeps/riscv/rv64/rvv/strspn.S
new file mode 100644
index 0000000000..2b9af5cc2d
--- /dev/null
+++ b/sysdeps/riscv/rv64/rvv/strspn.S
@@ -0,0 +1,189 @@ 
+
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+
+.globl  strspn
+.type   strspn,@function
+
+.globl  strcspn
+.type   strcspn,@function
+
+/*
+ *  optimized strspn / strcspn for riscv with vector extension
+ *  assumptions:
+ *  - vlenb is a power of 2
+ *  - page size >= 32
+ *  strategy:
+ *  - build a 256-bit lookup table in a vector register group, one bit per
+ *    byte value: clear if that byte should terminate the computation
+ *  - use vectorised lookups into this table to check 32 bytes at a time;
+ *    this code is identical for strspn and strcspn and can be shared
+ *
+ *  note that while V mandates vector registers at least 128 bits wide,
+ *  we are building a 256-bit lookup table,
+ *  so we use either LMUL=1 or LMUL=2 depending on what the target supports;
+ *  only even vector register numbers are used throughout,
+ *  so everything still works if we end up needing LMUL=2
+ */
+
+# -----------------------------
+
+.align    2
+
+strspn:
+    lbu         t0, 0(a1)
+    bnez        t0, .Lbuild_table
+    mv          a0, zero
+    ret
+
+.Lbuild_table:
+    mv          a6, a0 /* store incoming a0 */
+    li          t1, 32 /* want to deal with 256 bits at a time, so 32 bytes */
+
+    vsetvli     zero, t1, e8, m1, tu, mu
+#if __riscv_v_min_vlen < 256
+    /* we want to build a 256-bit table: use vlenb*2 bytes (m2) if registers
+     * are 128 bits wide, or vlenb bytes (m1) if vlen >= 256.
+     * the 'V' extension specifies a minimum vlen of 128, so this covers all
+     * cases; the check can be skipped if vlen >= 256 is known at compile time
+     */
+    csrr        t2, vlenb
+    bgeu        t2, t1, 1f
+    vsetvli     zero, t1, e8, m2, tu, mu
+1:
+#endif // __riscv_v_min_vlen
+
+    /* read one char from the charset at a time and write the correct bit
+     * in the lookup table; this could be vectorised if we ever get an
+     * extension that provides some way of scattering bytes into a reg group
+     */
+    vmv.v.x     v16, zero       /* clear out table */
+    vmv.v.x     v8, zero        /* clear out v8 */
+    li          t3, 1
+    vmv.s.x     v8, t3          /* v8 now all zeroes except bottom byte */
+
+1:  vmv.v.x     v2, zero        /* clear out v2 */
+    addi        a1, a1, 1       /* advance charset ptr */
+    srli        t2, t0, 3       /* divide the byte we read earlier by 8 */
+    vslideup.vx v2, v8, t2      /* v2 now 1 in the correct byte, 0 elsewhere */
+    vsll.vx     v2, v2, t0      /* v2 now 1 in the correct bit, 0 elsewhere */
+    vor.vv      v16, v16, v2    /* or it in */
+    lbu         t0, 0(a1)       /* fetch next byte */
+    bnez        t0, 1b          /* if it's not null, go round again */
+
+/*
+ *   Table is now built in v16.
+ *   Strategy:
+ *   - fetch next t1 bytes from memory
+ *   - vrgather on their values divided by 8 to get relevant bytes of table
+ *   - shift right to get the correct bit into bit 0
+ *   - and with 1, then compare with zero and check the mask
+ *     to see if we've hit a terminating byte
+ *
+ *   Before we can begin, a0 needs to be t1-aligned, so that when we fetch
+ *   the next t1 bytes - any of which may be the null terminator -
+ *   we do not cross a page boundary and read unmapped memory. Therefore
+ *   we have one read of however many bytes are needed to align a0,
+ *   before the main loop.
+ */
+
+.Lscan_table:
+    vmv.v.x     v8, t3              /* v8 now t1 bytes of 0x01 */
+
+    andi        t2, a0, 31          /* mask to align to t1 (== 32) */
+    beqz        t2, 2f              /* or skip if we're already aligned */
+    sub         t2, t1, t2          /* t2 now bytes to read to align to t1 */
+
+    vid.v       v2                  /* build mask instead of changing vl */
+    vmsltu.vx   v0, v2, t2          /* so we don't need to track LMUL */
+
+    vle8.v      v2, (a0), v0.t      /* load next bytes from input */
+    vsrl.vi     v4, v2, 3           /* divide by 8 */
+    vrgather.vv v6, v16, v4         /* corresponding bytes of bit table */
+    vsrl.vv     v6, v6, v2          /* shift correct bits to lsb */
+    vand.vv     v6, v6, v8          /* and with 1 to complete the lookups */
+    vmseq.vx    v4, v6, zero, v0.t  /* check to see if any 0s are present */
+    vfirst.m    t0, v4              /* index of the first 0, if any */
+    bgez        t0, .Lscan_end      /* if we found one, stop */
+    add         a0, a0, t2          /* advance by number of bytes we read */
+
+2:  add         a6, a6, t1     /* we'll advance a0 before the exit check */
+1:  vle8.v      v2, (a0)       /* as above but unmasked so t1 elts per pass */
+    add         a0, a0, t1
+
+    vsrl.vi     v4, v2, 3
+    vrgather.vv v6, v16, v4
+    vsrl.vv     v6, v6, v2
+    vand.vv     v6, v6, v8
+
+    vmseq.vx    v4, v6, zero
+    vfirst.m    t0, v4
+    bltz        t0, 1b
+
+.Lscan_end:
+    add         a0, a0, t0     /* calculate offset to terminating byte */
+    sub         a0, a0, a6
+    ret
+.size   strspn, .-strspn
+
+/* strcspn
+ *
+ * table build exactly as for strspn, except:
+ * - the lookup table starts with all bits except bit 0 of byte 0 set
+ * - we clear the corresponding bit for each byte in the charset
+ * once the table is built, we can reuse the scan code directly
+ */
+
+strcspn:
+    lbu         t0, 0(a1)
+    beqz        t0, strlen   /* no rejections -> prefix is whole string */
+
+    mv          a6, a0
+    li          t1, 32
+
+    vsetvli     zero, t1, e8, m1, tu, mu
+#if __riscv_v_min_vlen < 256
+    csrr        t2, vlenb
+    bgeu        t2, t1, 1f
+    vsetvli     zero, t1, e8, m2, tu, mu
+1:
+#endif // __riscv_v_min_vlen
+
+    vmv.v.x     v8, zero
+    li          t3, 1           /* all bits clear except bit 0 of byte 0 */
+    vmv.s.x     v8, t3
+    vnot.v      v16, v8         /* v16 is the inverse of that */
+    li          t4, -1
+
+1:  vmv.v.x     v2, zero
+    addi        a1, a1, 1       /* advance charset ptr */
+    srli        t2, t0, 3       /* select correct bit in v2 */
+    vslideup.vx v2, v8, t2
+    vsll.vx     v2, v2, t0
+    vnot.v      v2, v2          /* invert */
+    vand.vv     v16, v16, v2    /* clear the relevant bit of table */
+    lbu         t0, 0(a1)
+    bnez        t0, 1b
+    j           .Lscan_table
+.size   strcspn, .-strcspn
+
+libc_hidden_builtin_def (strspn)
+libc_hidden_builtin_def (strcspn)
\ No newline at end of file
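
For reference, a C model of the shared bitmap strategy above (not part of the
patch).  A 256-bit table holds one bit per byte value and scanning stops at
the first byte whose bit is clear; strspn sets bits for the accept set, while
strcspn starts from all ones and clears bits for the reject set plus bit 0,
so the terminator always stops the scan.  span_model and its 'spn' flag are
made-up names:

#include <stddef.h>
#include <string.h>

static size_t
span_model (const char *s, const char *set, int spn)
{
  unsigned char table[32];                     /* 256 bits, like v16 */
  memset (table, spn ? 0x00 : 0xff, sizeof table);
  if (!spn)
    table[0] &= 0xfe;                          /* clear the bit for '\0' */
  for (const unsigned char *p = (const unsigned char *) set; *p != '\0'; p++)
    {
      if (spn)
        table[*p >> 3] |= (unsigned char) (1u << (*p & 7));
      else
        table[*p >> 3] &= (unsigned char) ~(1u << (*p & 7));
    }
  size_t i = 0;
  for (;; i++)                                 /* vrgather + vsrl + vand */
    {
      unsigned char c = (unsigned char) s[i];
      if (((table[c >> 3] >> (c & 7)) & 1) == 0)
        return i;
    }
}

/* span_model (s, set, 1) behaves like strspn (s, set);
   span_model (s, set, 0) behaves like strcspn (s, set).  */
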