[RFC,16/19] riscv: Add accelerated strcmp routines

Message ID 20230207001618.458947-17-christoph.muellner@vrull.eu
State New
Series riscv: ifunc support with optimized mem*/str*/cpu_relax routines

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Christoph Müllner Feb. 7, 2023, 12:16 a.m. UTC
  From: Christoph Müllner <christoph.muellner@vrull.eu>

The implementation of strcmp() can be accelerated using Zbb's orc.b
instruction and fast unaligned accesses. However, strcmp can use
unaligned accesses only if they do not change the exception
behaviour (compared to a single-byte compare loop).
Let's add an implementation that does all that.
Additionally, let's add the strcmp implementation from the
Bitmanip specification, which does not do any unaligned accesses.

Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
---
 sysdeps/riscv/multiarch/Makefile              |   4 +-
 sysdeps/riscv/multiarch/ifunc-impl-list.c     |   4 +-
 sysdeps/riscv/multiarch/strcmp.c              |  11 +-
 sysdeps/riscv/multiarch/strcmp_zbb.S          | 104 +++++++++
 .../riscv/multiarch/strcmp_zbb_unaligned.S    | 213 ++++++++++++++++++
 5 files changed, 332 insertions(+), 4 deletions(-)
 create mode 100644 sysdeps/riscv/multiarch/strcmp_zbb.S
 create mode 100644 sysdeps/riscv/multiarch/strcmp_zbb_unaligned.S
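
For reference, the orc.b trick that both new routines rely on can be
modelled in C roughly as follows (a sketch only, not part of the patch;
it assumes a compiler targeting RISC-V with Zbb enabled):

#include <stdint.h>

/* orc.b sets each result byte to 0xff if the corresponding source byte
   is non-zero and to 0x00 if it is zero.  A word therefore contains a
   NUL byte exactly when orc.b(word) != -1 (all ones).  */
static inline uintptr_t
orc_b (uintptr_t x)
{
  uintptr_t r;
  asm ("orc.b %0, %1" : "=r" (r) : "r" (x));
  return r;
}

static inline int
word_has_nul (uintptr_t x)
{
  return orc_b (x) != (uintptr_t) -1;
}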
  

Comments

Xi Ruoyao Feb. 7, 2023, 11:57 a.m. UTC | #1
Is it possible to make the optimized generic string routine [1] ifunc-
aware in some way?  Or can we "templatize" it so we can make vectorized
string routines more easily?

[1]: https://sourceware.org/pipermail/libc-alpha/2023-February/145211.html

On Tue, 2023-02-07 at 01:16 +0100, Christoph Muellner wrote:
> [ ... ]
  
Christoph Müllner Feb. 7, 2023, 2:15 p.m. UTC | #2
On Tue, Feb 7, 2023 at 12:57 PM Xi Ruoyao <xry111@xry111.site> wrote:

> Is it possible to make the optimized generic string routine [1] ifunc-
> aware in some way?  Or can we "templatize" it so we can make vectorized
> string routines more easily?
>

You mean to compile the generic code multiple times with different sets
of enabled extensions? Yes, that should work (my patchset does the same
for memset). Let's wait for the referenced patchset to land (looks like this
will happen soon).
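
A minimal sketch of that idea, with made-up file and macro names (it
assumes the generic string/strcmp.c honours a STRCMP override, as the
__strcmp_generic wrapper in this series presumably already does):

/* sysdeps/riscv/multiarch/strcmp_zbb_from_generic.c (hypothetical),
   built with per-file flags such as
       CFLAGS-strcmp_zbb_from_generic.c += -march=rv64gc_zbb
   so the compiler can emit orc.b/ctz for the generic C code.  */
#define STRCMP __strcmp_zbb_from_generic
#include <string/strcmp.c>

/* The ifunc resolver then simply gains one more candidate to pick
   from, exactly like the hand-written variants in this patch.  */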

Thanks,
Christoph



>
> [1]: https://sourceware.org/pipermail/libc-alpha/2023-February/145211.html
>
> [ ... ]
  
Jeff Law March 31, 2023, 5:06 a.m. UTC | #3
On 2/7/23 07:15, Christoph Müllner wrote:

>      > diff --git a/sysdeps/riscv/multiarch/strcmp_zbb_unaligned.S
>      > b/sysdeps/riscv/multiarch/strcmp_zbb_unaligned.S
[ ... ]


>      > +
>      > +ENTRY_ALIGN (STRCMP, 6)
>      > +       /* off...delta from src1 to src2.  */
>      > +       sub     off, src2, src1
>      > +       li      m1, -1
>      > +       andi    tmp, off, SZREG-1
>      > +       andi    align1, src1, SZREG-1
>      > +       bnez    tmp, L(misaligned8)
>      > +       bnez    align1, L(mutual_align)
>      > +
>      > +       .p2align 4
>      > +L(loop_aligned):
>      > +       REG_L   data1, 0(src1)
>      > +       add     tmp, src1, off
>      > +       addi    src1, src1, SZREG
>      > +       REG_L   data2, 0(tmp)

So any thoughts on reducing the alignment?  Based on the data I've seen,
we very rarely take the branch to L(loop_aligned), so aligning this
particular label is of dubious value to begin with.  As it stands we
have to emit three full-sized nops to achieve the requested alignment,
and they can burn most of an issue cycle.

While it's highly dependent on pipeline state, there's a reasonable 
chance of shaving a cycle by reducing the alignment to p2align 3.

I haven't done much analysis of the rest of the code, as it just hasn't
been hot in any of the cases I've looked at.

I'd be comfortable with this going in as-is or with the alignment 
adjustment.  Obviously wiring it up via ifunc is dependent upon settling 
the kernel->glibc interface.

Jeff
  
Adhemerval Zanella March 31, 2023, 12:31 p.m. UTC | #4
On 31/03/23 02:06, Jeff Law wrote:
> 
> 
> On 2/7/23 07:15, Christoph Müllner wrote:
> 
>>      > diff --git a/sysdeps/riscv/multiarch/strcmp_zbb_unaligned.S
>>      > b/sysdeps/riscv/multiarch/strcmp_zbb_unaligned.S
> [ ... ]
> 
> 
>>      > +
>>      > +ENTRY_ALIGN (STRCMP, 6)
>>      > +       /* off...delta from src1 to src2.  */
>>      > +       sub     off, src2, src1
>>      > +       li      m1, -1
>>      > +       andi    tmp, off, SZREG-1
>>      > +       andi    align1, src1, SZREG-1
>>      > +       bnez    tmp, L(misaligned8)
>>      > +       bnez    align1, L(mutual_align)
>>      > +
>>      > +       .p2align 4
>>      > +L(loop_aligned):
>>      > +       REG_L   data1, 0(src1)
>>      > +       add     tmp, src1, off
>>      > +       addi    src1, src1, SZREG
>>      > +       REG_L   data2, 0(tmp)
> 
> So any thoughts on reducing the alignment?  Based on the data I've seen we very rarely ever take the branch to L(loop_aligned).    So aligning this particular label is of dubious value to begin with.  As it stands we have to emit 3 full sized nops to achieve the requested alignment and they can burn most of an issue cycle.
> 
> While it's highly dependent on pipeline state, there's a reasonable chance of shaving a cycle by reducing the alignment to p2align 3.
> 
> I haven't done much with analyzing the rest of the code as it just hasn't been hot in any of the cases I've looked at.
> 
> I'd be comfortable with this going in as-is or with the alignment adjustment.  Obviously wiring it up via ifunc is dependent upon settling the kernel->glibc interface.
> 
> Jeff

Is this implementation really better than the new generic one [1]?  With a
target with zbb support, the generic word comparison should use the orc.b
instruction [2].  And the final comparison, once the last word or the
mismatching word is found, should use the clz/ctz instructions [3] (which
also results in branchless code, albeit I have not checked whether it is
better than the snippet this implementation uses).
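
For comparison, the final compare used in the assembly above (the L(end)
path of strcmp_zbb_unaligned) corresponds roughly to the following C for
64-bit little-endian (a sketch; the mask is non-zero because this is only
reached on a mismatch or NUL):

#include <stdint.h>

static inline uint64_t
orc_b (uint64_t x)
{
  uint64_t r;
  asm ("orc.b %0, %1" : "=r" (r) : "r" (x));
  return r;
}

/* Returns <0, 0 or >0 based on the first byte position where data1
   holds a NUL or the two words differ.  */
static int
final_cmp (uint64_t data1, uint64_t data2)
{
  /* 0xff bytes mark positions that differ or hold a NUL in data1.  */
  uint64_t mask = orc_b (data1 ^ data2) | ~orc_b (data1);
  int shift = __builtin_ctzll (mask);        /* 8 * index of that byte */
  /* rev8 + sll in the assembly: bring that byte to the top, extract.  */
  uint64_t b1 = (__builtin_bswap64 (data1) << shift) >> 56;
  uint64_t b2 = (__builtin_bswap64 (data2) << shift) >> 56;
  return (int) b1 - (int) b2;
}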

The generic implementation also has the advantage of using word
instructions in the unaligned case, where this implementation does a
naive byte-by-byte check.

So maybe a better option would be to optimize the generic implementation
further.  One option might be to parametrize final_cmp so you can use the
branchless trick (if it is indeed better than the generic code).  Another
option that the generic implementation does not explore is manual loop
unrolling, as done by multiple assembly implementations.

[1] https://sourceware.org/git/?p=glibc.git;a=blob;f=string/strcmp.c;h=11ec8bac816b630417ccbfeba70f9eab6ec37874;hb=HEAD
[2] https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/riscv/string-fza.h;h=4429653a001de09730cfa83325b29556a8afb5ed;hb=HEAD
[3] https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/generic/string-fzi.h;h=2deecefc236833abffbee886851d75e7ecf66755;hb=HEAD
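
As an illustration of the manual unrolling option mentioned above, a 2x
unrolled aligned word loop in generic C might look roughly like this (a
sketch only; a portable has_zero() bit trick stands in for the
string-fza.h helpers, and it is neither tested nor benchmarked):

#include <stdint.h>

typedef uintptr_t op_t;

/* True if any byte of X is zero (classic portable bit trick).  */
static inline int
has_zero (op_t x)
{
  const op_t ones = (op_t) -1 / 0xff;        /* 0x0101...01 */
  const op_t highs = ones << 7;              /* 0x8080...80 */
  return ((x - ones) & ~x & highs) != 0;
}

/* Plain byte-wise tail once the offending word has been found.  */
static int
tail_cmp (const unsigned char *p1, const unsigned char *p2)
{
  while (*p1 != '\0' && *p1 == *p2)
    p1++, p2++;
  return *p1 - *p2;
}

/* Both pointers must be word-aligned.  */
static int
strcmp_aligned_unrolled (const op_t *w1, const op_t *w2)
{
  for (;;)
    {
      op_t a = w1[0], b = w2[0];
      if (has_zero (a) || a != b)
        break;
      a = w1[1]; b = w2[1];
      if (has_zero (a) || a != b)
        {
          w1++; w2++;
          break;
        }
      w1 += 2; w2 += 2;
    }
  return tail_cmp ((const unsigned char *) w1, (const unsigned char *) w2);
}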
  
Jeff Law March 31, 2023, 2:30 p.m. UTC | #5
On 3/31/23 06:31, Adhemerval Zanella Netto wrote:

>> Jeff
> 
> Is this implementation really better than new generic one [1]? With a target
> with zbb support, the generic word comparison should use orc.b instruction [2].
> And the final comparison, once with the last word or the mismatch word is found,
> should use clz/ctz instruction [3] (result also in branchless code, albeit
> I have not check if better than the snippet this implementation uses).
I haven't done any comparisons against the updated generic bits.  I nearly
suggested that Christoph do that evaluation, but when I wandered
around sysdeps I saw that we still had multiple custom strcmp 
implementations and set that suggestion aside.


> 
> The generic implementation also has the advantage of use word instruction
> on unaligned case, where this implementation does a naive byte per byte
> check.
Yea, but in my digging this just didn't happen terribly often.  I don't 
think there's a lot of value there.  Along the same lines, my 
investigation didn't show any significant value in the realign cases, and I 
nearly suggested dropping them to avoid the branch in the hot path, but 
I wasn't confident enough in the breadth of my investigations to push it.

> 
> So maybe a better option would to optimize further the generic implementation.
> One option might be to parametrize the final_cmp so you can use the branchless
> trick (if it indeed is better than generic code).  Another option that the
> generic implementation does not explore is manual loop unrolling, as done by
> multiple assembly implementations.
I could certainly support that.  I was on the fence about pushing to use 
the generic bits; a little nudge could easily push me to that side.

jeff
  
Jeff Law March 31, 2023, 2:32 p.m. UTC | #6
On 2/7/23 07:15, Christoph Müllner wrote:
> 
> On Tue, Feb 7, 2023 at 12:57 PM Xi Ruoyao <xry111@xry111.site> wrote:
> 
>     Is it possible to make the optimized generic string routine [1] ifunc-
>     aware in some way?  Or can we "templatize" it so we can make vectorized
>     string routines more easily?
> 
> 
> You mean to compile the generic code multiple times with different sets
> of enabled extensions? Yes, that should work (my patchset does the same
> for memset). Let's wait for the referenced patchset to land (looks like this
> will happen soon).
We would still need that kernel interface to be sorted out to use 
function multi-versioning.  Under the hood what GCC generates for FMV 
looks a lot like an ifunc resolver.
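
For readers unfamiliar with the mechanism, an ifunc at the C level is
essentially the following (hwcap_has_zbb() is a placeholder for whatever
kernel->glibc interface gets agreed on; all names are illustrative):

extern int strcmp_zbb (const char *, const char *);
extern int strcmp_generic (const char *, const char *);
extern int hwcap_has_zbb (void);   /* hypothetical probe */

/* The resolver runs once, at relocation time, and returns the address
   of the implementation that all later calls will use.  */
static int (*resolve_strcmp (void)) (const char *, const char *)
{
  return hwcap_has_zbb () ? strcmp_zbb : strcmp_generic;
}

int my_strcmp (const char *, const char *)
  __attribute__ ((ifunc ("resolve_strcmp")));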

jeff
  
Adhemerval Zanella March 31, 2023, 2:48 p.m. UTC | #7
On 31/03/23 11:30, Jeff Law wrote:
> 
> 
> On 3/31/23 06:31, Adhemerval Zanella Netto wrote:
> 
>>> Jeff
>>
>> Is this implementation really better than new generic one [1]? With a target
>> with zbb support, the generic word comparison should use orc.b instruction [2].
>> And the final comparison, once with the last word or the mismatch word is found,
>> should use clz/ctz instruction [3] (result also in branchless code, albeit
>> I have not check if better than the snippet this implementation uses).
> I haven't done any comparisons against the updated generic bits.  I nearly suggested to Christoph to do that evaluation, but when I wandered around sysdeps I saw that we still had multiple custom strcmp implementations and set that suggestion aside.
> 
> 
>>
>> The generic implementation also has the advantage of use word instruction
>> on unaligned case, where this implementation does a naive byte per byte
>> check.
> Yea, but in my digging this just didn't happen terribly often.  I don't think there's a lot of value there.  Along the same lines, my investigation didn't show any significant value to realign cases and I nearly suggested dropping them to avoid the branch in the hot path, but I wasn't confident enough in the breadth of my investigations to push it.
> >>
>> So maybe a better option would to optimize further the generic implementation.
>> One option might be to parametrize the final_cmp so you can use the branchless
>> trick (if it indeed is better than generic code).  Another option that the
>> generic implementation does not explore is manual loop unrolling, as done by
>> multiple assembly implementations.
> I could certainly support that.  I was on the fence about pushing to use the generic bits, a little nudge could easily push me to that side.

The initial realign could be tuned; I added it mostly because it simplifies
both the aligned and unaligned cases a lot.  But it should be doable to use
a strategy similar to strchr/strlen and mask off the bits based on the input
alignment.

The unaligned case is just there to avoid drastic performance differences
between input alignments; it is cheap and in the end should just cost some
additional code size.

But the main gain of using the generic implementation is one less assembly
routine to maintain and tune; and by improving the generic implementation
we gain across the ecosystem as a whole.
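
For reference, the mask-off trick mentioned above (also used by the
L(mutual_align) path of the assembly) looks roughly like this in C for
little-endian (a sketch; the aligned load that starts before the string
is the usual string-routine idiom of staying within the page):

#include <stdint.h>

/* Read the aligned word containing *s and force the bytes that precede
   s to 0xff, so they cannot look like a NUL terminator and compare
   equal for both inputs.  */
static uintptr_t
load_first_word_masked (const char *s)
{
  uintptr_t addr = (uintptr_t) s;
  const uintptr_t *aligned = (const uintptr_t *) (addr & -sizeof (uintptr_t));
  unsigned int shift = (addr % sizeof (uintptr_t)) * 8;
  /* (uintptr_t) -1 << shift is 0 in the low 'shift' bits; its
     complement marks exactly the bytes before the start point.  */
  return *aligned | ~((uintptr_t) -1 << shift);
}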
  
Palmer Dabbelt March 31, 2023, 5:19 p.m. UTC | #8
On Fri, 31 Mar 2023 07:48:43 PDT (-0700), adhemerval.zanella@linaro.org wrote:
>
>
> On 31/03/23 11:30, Jeff Law wrote:
>>
>>
>> On 3/31/23 06:31, Adhemerval Zanella Netto wrote:
>>
>>>> Jeff
>>>
>>> Is this implementation really better than new generic one [1]? With a target
>>> with zbb support, the generic word comparison should use orc.b instruction [2].
>>> And the final comparison, once with the last word or the mismatch word is found,
>>> should use clz/ctz instruction [3] (result also in branchless code, albeit
>>> I have not check if better than the snippet this implementation uses).
>> I haven't done any comparisons against the updated generic bits.  I nearly suggested to Christoph to do that evaluation, but when I wandered around sysdeps I saw that we still had multiple custom strcmp implementations and set that suggestion aside.
>>
>>
>>>
>>> The generic implementation also has the advantage of use word instruction
>>> on unaligned case, where this implementation does a naive byte per byte
>>> check.
>> Yea, but in my digging this just didn't happen terribly often.  I don't think there's a lot of value there.  Along the same lines, my investigation didn't show any significant value to realign cases and I nearly suggested dropping them to avoid the branch in the hot path, but I wasn't confident enough in the breadth of my investigations to push it.
>> >>
>>> So maybe a better option would to optimize further the generic implementation.
>>> One option might be to parametrize the final_cmp so you can use the branchless
>>> trick (if it indeed is better than generic code).  Another option that the
>>> generic implementation does not explore is manual loop unrolling, as done by
>>> multiple assembly implementations.
>> I could certainly support that.  I was on the fence about pushing to use the generic bits, a little nudge could easily push me to that side.
>
> The initial realign could be tuned, I added mostly because it simplifies both
> aligned and unaligned case a lot.  But it should be doable to use a similar
> strategy as strchr/strlen to mask off the bits based on the input alignment.
>
> The unaligned case is just to avoid drastic performance different between
> input alignment, it is cheap and in the end should just be additional code
> size.
>
> But the main gain of using the generic implementation is one less assembly
> routine to maintain and tune; and by improving the generic implementation
> we gain in ecosystem as whole.

I think we should use the generic stuff where we can, just to avoid 
extra maintenance issues.  I think we'll eventually end up with 
vendor-specific assembly routines, but IMO it's best to only merge those 
if there's a meaningful performance advantage and there's no way to 
replicate it without resorting to assembly.
  

Patch

diff --git a/sysdeps/riscv/multiarch/Makefile b/sysdeps/riscv/multiarch/Makefile
index 3017bde75a..73a62be85d 100644
--- a/sysdeps/riscv/multiarch/Makefile
+++ b/sysdeps/riscv/multiarch/Makefile
@@ -11,5 +11,7 @@  sysdep_routines += \
 	strlen_generic \
 	strlen_zbb \
 	\
-	strcmp_generic
+	strcmp_generic \
+	strcmp_zbb \
+	strcmp_zbb_unaligned
 endif
diff --git a/sysdeps/riscv/multiarch/ifunc-impl-list.c b/sysdeps/riscv/multiarch/ifunc-impl-list.c
index 64331a4c7f..d354aa1178 100644
--- a/sysdeps/riscv/multiarch/ifunc-impl-list.c
+++ b/sysdeps/riscv/multiarch/ifunc-impl-list.c
@@ -59,7 +59,9 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_generic))
 
   IFUNC_IMPL (i, name, strcmp,
-	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcmp_generic))
+	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_zbb_unaligned)
+	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_zbb)
+	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_generic))
 
   return i;
 }
diff --git a/sysdeps/riscv/multiarch/strcmp.c b/sysdeps/riscv/multiarch/strcmp.c
index 8c21a90afd..d3f2fe19ae 100644
--- a/sysdeps/riscv/multiarch/strcmp.c
+++ b/sysdeps/riscv/multiarch/strcmp.c
@@ -30,8 +30,15 @@ 
 
 extern __typeof (__redirect_strcmp) __libc_strcmp;
 extern __typeof (__redirect_strcmp) __strcmp_generic attribute_hidden;
-
-libc_ifunc (__libc_strcmp, __strcmp_generic);
+extern __typeof (__redirect_strcmp) __strcmp_zbb attribute_hidden;
+extern __typeof (__redirect_strcmp) __strcmp_zbb_unaligned attribute_hidden;
+
+libc_ifunc (__libc_strcmp,
+	    HAVE_RV(zbb) && HAVE_FAST_UNALIGNED()
+	    ? __strcmp_zbb_unaligned
+	    : HAVE_RV(zbb)
+	      ? __strcmp_zbb
+	      : __strcmp_generic);
 
 # undef strcmp
 strong_alias (__libc_strcmp, strcmp);
diff --git a/sysdeps/riscv/multiarch/strcmp_zbb.S b/sysdeps/riscv/multiarch/strcmp_zbb.S
new file mode 100644
index 0000000000..1c265d6107
--- /dev/null
+++ b/sysdeps/riscv/multiarch/strcmp_zbb.S
@@ -0,0 +1,104 @@ 
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+/* Assumptions: rvi_zbb.  */
+/* Implementation from the Bitmanip specification.  */
+
+#define src1		a0
+#define result		a0
+#define src2		a1
+#define data1		a2
+#define data2		a3
+#define align		a4
+#define data1_orcb	t0
+#define m1		t2
+
+#if __riscv_xlen == 64
+# define REG_L	ld
+# define SZREG	8
+#else
+# define REG_L	lw
+# define SZREG	4
+#endif
+
+#ifndef STRCMP
+# define STRCMP __strcmp_zbb
+#endif
+
+.option push
+.option arch,+zbb
+
+ENTRY_ALIGN (STRCMP, 6)
+	or	align, src1, src2
+	and	align, align, SZREG-1
+	bnez	align, L(simpleloop)
+	li	m1, -1
+
+	/* Main loop for aligned strings.  */
+	.p2align 2
+L(loop):
+	REG_L	data1, 0(src1)
+	REG_L	data2, 0(src2)
+	orc.b	data1_orcb, data1
+	bne	data1_orcb, m1, L(foundnull)
+	addi	src1, src1, SZREG
+	addi	src2, src2, SZREG
+	beq	data1, data2, L(loop)
+
+	/* Words don't match, and no null byte in the first word.
+	 * Get bytes in big-endian order and compare.  */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	rev8	data1, data1
+	rev8	data2, data2
+#endif
+	/* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence.  */
+	sltu	result, data1, data2
+	neg	result, result
+	ori	result, result, 1
+	ret
+
+L(foundnull):
+	/* Found a null byte.
+	 * If words don't match, fall back to simple loop.  */
+	bne	data1, data2, L(simpleloop)
+
+	/* Otherwise, strings are equal.  */
+	li	result, 0
+	ret
+
+	/* Simple loop for misaligned strings.  */
+	.p2align 3
+L(simpleloop):
+	lbu	data1, 0(src1)
+	lbu	data2, 0(src2)
+	addi	src1, src1, 1
+	addi	src2, src2, 1
+	bne	data1, data2, L(sub)
+	bnez	data1, L(simpleloop)
+
+L(sub):
+	sub	result, data1, data2
+	ret
+
+.option pop
+
+END (STRCMP)
+libc_hidden_builtin_def (STRCMP)
diff --git a/sysdeps/riscv/multiarch/strcmp_zbb_unaligned.S b/sysdeps/riscv/multiarch/strcmp_zbb_unaligned.S
new file mode 100644
index 0000000000..ec21982b65
--- /dev/null
+++ b/sysdeps/riscv/multiarch/strcmp_zbb_unaligned.S
@@ -0,0 +1,213 @@ 
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+/* Assumptions: rvi_zbb with fast unaligned access.  */
+/* Implementation inspired by aarch64/strcmp.S.  */
+
+#define src1		a0
+#define result		a0
+#define src2		a1
+#define off		a3
+#define m1		a4
+#define align1		a5
+#define src3		a6
+#define tmp		a7
+
+#define data1		t0
+#define data2		t1
+#define b1		t0
+#define b2		t1
+#define data3		t2
+#define data1_orcb	t3
+#define data3_orcb	t4
+#define shift		t5
+
+#if __riscv_xlen == 64
+# define REG_L	ld
+# define SZREG	8
+# define PTRLOG	3
+#else
+# define REG_L	lw
+# define SZREG	4
+# define PTRLOG	2
+#endif
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+# error big endian is untested!
+# define CZ	ctz
+# define SHIFT	srl
+# define SHIFT2	sll
+#else
+# define CZ	ctz
+# define SHIFT	sll
+# define SHIFT2	srl
+#endif
+
+#ifndef STRCMP
+# define STRCMP __strcmp_zbb_unaligned
+#endif
+
+.option push
+.option arch,+zbb
+
+ENTRY_ALIGN (STRCMP, 6)
+	/* off...delta from src1 to src2.  */
+	sub	off, src2, src1
+	li	m1, -1
+	andi	tmp, off, SZREG-1
+	andi	align1, src1, SZREG-1
+	bnez	tmp, L(misaligned8)
+	bnez	align1, L(mutual_align)
+
+	.p2align 4
+L(loop_aligned):
+	REG_L	data1, 0(src1)
+	add	tmp, src1, off
+	addi	src1, src1, SZREG
+	REG_L	data2, 0(tmp)
+
+L(start_realigned):
+	orc.b	data1_orcb, data1
+	bne	data1_orcb, m1, L(end)
+	beq	data1, data2, L(loop_aligned)
+
+L(fast_end):
+	/* Words don't match, and no NUL byte in one word.
+	   Get bytes in big-endian order and compare as words.  */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	rev8	data1, data1
+	rev8	data2, data2
+#endif
+	/* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence.  */
+	sltu	result, data1, data2
+	neg	result, result
+	ori	result, result, 1
+	ret
+
+L(end_orc):
+	orc.b	data1_orcb, data1
+L(end):
+	/* Words don't match or NUL byte in at least one word.
+	   data1_orcb holds orc.b value of data1.  */
+	xor	tmp, data1, data2
+	orc.b	tmp, tmp
+
+	orn	tmp, tmp, data1_orcb
+	CZ	shift, tmp
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	rev8	data1, data1
+	rev8	data2, data2
+#endif
+	sll	data1, data1, shift
+	sll	data2, data2, shift
+	srl	b1, data1, SZREG*8-8
+	srl	b2, data2, SZREG*8-8
+
+L(end_singlebyte):
+	sub	result, b1, b2
+	ret
+
+	.p2align 4
+L(mutual_align):
+	/* Sources are mutually aligned, but are not currently at an
+	   alignment boundary.  Round down the addresses and then mask off
+	   the bytes that precede the start point.  */
+	andi	src1, src1, -SZREG
+	add	tmp, src1, off
+	REG_L	data1, 0(src1)
+	addi	src1, src1, SZREG
+	REG_L	data2, 0(tmp)
+	/* Get number of bits to mask.  */
+	sll	shift, src2, 3
+	/* Bits to mask are now 0, others are 1.  */
+	SHIFT	tmp, m1, shift
+	/* Or with inverted value -> masked bits become 1.  */
+	orn	data1, data1, tmp
+	orn	data2, data2, tmp
+	j	L(start_realigned)
+
+L(misaligned8):
+	/* Skip slow loop if SRC1 is aligned.  */
+	beqz	align1, L(src1_aligned)
+L(do_misaligned):
+	/* Align SRC1 to 8 bytes.  */
+	lbu	b1, 0(src1)
+	lbu	b2, 0(src2)
+	beqz	b1, L(end_singlebyte)
+	bne	b1, b2, L(end_singlebyte)
+	addi	src1, src1, 1
+	addi	src2, src2, 1
+	andi	align1, src1, SZREG-1
+	bnez	align1, L(do_misaligned)
+
+L(src1_aligned):
+	/* SRC1 is aligned. Align SRC2 down and check for NUL there.
+	 * If there is no NUL, we may read the next word from SRC2.
+	 * If there is a NUL, we must not read a complete word from SRC2
+	 * because we might cross a page boundary.  */
+	/* Get number of bits to mask (upper bits are ignored by shifts).  */
+	sll	shift, src2, 3
+	/* src3 := align_down (src2)  */
+	andi	src3, src2, -SZREG
+	REG_L   data3, 0(src3)
+	addi	src3, src3, SZREG
+
+	/* Bits to mask are now 0, others are 1.  */
+	SHIFT	tmp, m1, shift
+	/* Or with inverted value -> masked bits become 1.  */
+	orn	data3_orcb, data3, tmp
+	/* Check for NUL in next aligned word.  */
+	orc.b	data3_orcb, data3_orcb
+	bne	data3_orcb, m1, L(unaligned_nul)
+
+	.p2align 4
+L(loop_unaligned):
+	/* Read the (aligned) data1 and the unaligned data2.  */
+	REG_L	data1, 0(src1)
+	addi	src1, src1, SZREG
+	REG_L	data2, 0(src2)
+	addi	src2, src2, SZREG
+	orc.b	data1_orcb, data1
+	bne	data1_orcb, m1, L(end)
+	bne	data1, data2, L(end)
+
+	/* Read the next aligned-down word.  */
+	REG_L	data3, 0(src3)
+	addi	src3, src3, SZREG
+	orc.b	data3_orcb, data3
+	beq	data3_orcb, m1, L(loop_unaligned)
+
+L(unaligned_nul):
+	/* src1 points to unread word (only first bytes relevant).
+	 * data3 holds next aligned-down word with NUL.
+	 * Compare the first bytes of data1 with the last bytes of data3.  */
+	REG_L	data1, 0(src1)
+	/* Shift NUL bytes into data3 to become data2.  */
+	SHIFT2	data2, data3, shift
+	bne	data1, data2, L(end_orc)
+	li	result, 0
+	ret
+
+.option pop
+
+END (STRCMP)
+libc_hidden_builtin_def (STRCMP)