[RFC,18/19] riscv: Add an optimized strncmp routine

Message ID 20230207001618.458947-19-christoph.muellner@vrull.eu
State New
Series riscv: ifunc support with optimized mem*/str*/cpu_relax routines

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Christoph Müllner Feb. 7, 2023, 12:16 a.m. UTC
  From: Christoph Müllner <christoph.muellner@vrull.eu>

The implementation of strncmp() can be accelerated using Zbb's orc.b
instruction. Let's add an optimized implementation that makes use
of this instruction.

Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
---
 sysdeps/riscv/multiarch/Makefile          |   3 +-
 sysdeps/riscv/multiarch/ifunc-impl-list.c |   1 +
 sysdeps/riscv/multiarch/strncmp.c         |   6 +-
 sysdeps/riscv/multiarch/strncmp_zbb.S     | 119 ++++++++++++++++++++++
 4 files changed, 127 insertions(+), 2 deletions(-)
 create mode 100644 sysdeps/riscv/multiarch/strncmp_zbb.S
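
For readers unfamiliar with orc.b: the instruction OR-combines the bits
within each byte, so every nonzero byte of the source becomes 0xff and every
zero byte stays 0x00; a word therefore contains a NUL byte exactly when the
result differs from all-ones. A minimal C model of these semantics (the
names are illustrative, not part of the patch):

  #include <stdint.h>
  #include <stdbool.h>

  /* C model of Zbb's orc.b: each result byte is 0xff if the
     corresponding source byte is nonzero, 0x00 otherwise.  */
  static uint64_t
  orc_b (uint64_t x)
  {
    uint64_t r = 0;
    for (int i = 0; i < 64; i += 8)
      if ((x >> i) & 0xff)
        r |= (uint64_t) 0xff << i;
    return r;
  }

  /* A word holds a NUL byte iff orc_b (word) != all-ones; this is the
     condition behind the `bne data1_orcb, m1, L(foundnull)` branch in
     the patch below.  */
  static bool
  has_nul (uint64_t word)
  {
    return orc_b (word) != UINT64_MAX;
  }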
  

Comments

Noah Goldstein Feb. 7, 2023, 1:19 a.m. UTC | #1
On Mon, Feb 6, 2023 at 6:23 PM Christoph Muellner
<christoph.muellner@vrull.eu> wrote:
>
> From: Christoph Müllner <christoph.muellner@vrull.eu>
>
> The implementation of strncmp() can be accelerated using Zbb's orc.b
> instruction. Let's add an optimized implementation that makes use
> of this instruction.
>
> Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>

Not necessary, but imo performance patches should have at least some reference
to the expected speedup versus the existing alternatives.
> [... patch body snipped; the full patch appears at the bottom of this page ...]
  
Philipp Tomsich Feb. 8, 2023, 3:13 p.m. UTC | #2
On Tue, 7 Feb 2023 at 02:20, Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Feb 6, 2023 at 6:23 PM Christoph Muellner
> <christoph.muellner@vrull.eu> wrote:
> >
> > From: Christoph Müllner <christoph.muellner@vrull.eu>
> >
> > The implementation of strncmp() can be accelerated using Zbb's orc.b
> > instruction. Let's add an optimized implementation that makes use
> > of this instruction.
> >
> > Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
>
> Not necessary, but imo performance patches should have at least some reference
> to the expected speedup versus the existing alternatives.

Given that this is effectively a SWAR-like optimization (orc.b allows
us to test 8 bytes in parallel for a NUL byte), we should be able to
show the benefit through a reduction in dynamic instructions.  Would
this be considered reasonable reference data?

> > [... patch body snipped; the full patch appears at the bottom of this page ...]
  
Palmer Dabbelt Feb. 8, 2023, 5:55 p.m. UTC | #3
On Wed, 08 Feb 2023 07:13:44 PST (-0800), philipp.tomsich@vrull.eu wrote:
> On Tue, 7 Feb 2023 at 02:20, Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> On Mon, Feb 6, 2023 at 6:23 PM Christoph Muellner
>> <christoph.muellner@vrull.eu> wrote:
>> >
>> > From: Christoph Müllner <christoph.muellner@vrull.eu>
>> >
>> > The implementation of strncmp() can be accelerated using Zbb's orc.b
>> > instruction. Let's add an optimized implementation that makes use
>> > of this instruction.
>> >
>> > Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
>>
>> Not necessary, but imo performance patches should have at least some reference
>> to the expected speedup versus the existing alternatives.
>
> Given that this is effectively a SWAR-like optimization (orc.b allows
> us to test 8 bytes in parallel for a NUL byte), we should be able to
> show the benefit through a reduction in dynamic instructions.  Would
> this be considered reasonable reference data?

Generally for performance improvements the only metrics that count come 
from real hardware.  Processor implementation is complex and it's not 
generally true that reducing dynamic instructions results in better 
performance (particularly when more complex flavors of instructions replace 
simpler ones).

We've not been so good about this on the RISC-V side of things, though.  
I think that's largely because we didn't have all that much complexity 
around this, but there's a ton of stuff showing up right now.  The 
general theory has been that Zbb instructions will execute faster than 
their corresponding I sequences, but nobody has proved that.  I believe 
the new JH7110 has Zba and Zbb, so maybe the right answer there is to 
just benchmark things before merging them?  That way we can get back to 
doing things sanely before we go too far down the premature optimization 
rabbit hole.

FWIW: we had a pretty similar discussion in Linux land around these and 
nobody could get the JH7110 to boot, but given that we have ~6 months 
until glibc releases again hopefully that will be sorted out.  There's a 
bunch of ongoing work looking at the more core issues like probing, so 
maybe it's best to focus on getting that all sorted out first?  It's 
kind of awkward to have a bunch of routines posted in a whole new 
framework that's not sorting out all the probing dependencies.

>> > [... patch body snipped; the full patch appears at the bottom of this page ...]
  
Noah Goldstein Feb. 8, 2023, 6:04 p.m. UTC | #4
On Wed, Feb 8, 2023 at 9:13 AM Philipp Tomsich <philipp.tomsich@vrull.eu> wrote:
>
> On Tue, 7 Feb 2023 at 02:20, Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Mon, Feb 6, 2023 at 6:23 PM Christoph Muellner
> > <christoph.muellner@vrull.eu> wrote:
> > >
> > > From: Christoph Müllner <christoph.muellner@vrull.eu>
> > >
> > > The implementation of strncmp() can be accelerated using Zbb's orc.b
> > > instruction. Let's add an optimized implementation that makes use
> > > of this instruction.
> > >
> > > Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
> >
> > Not necessary, but imo performance patches should have at least some reference
> > to the expected speedup versus the existing alternatives.
>
> Given that this is effectively a SWAR-like optimization (orc.b allows
> us to test 8 bytes in parallel for a NUL byte), we should be able to
> show the benefit through a reduction in dynamic instructions.  Would
> this be considered reasonable reference data?
>

GLIBC has a benchmark suite for all the string/memory functions, so I
would expect improvements in those results compared to the generic
implementations.


> > > [... patch body snipped; the full patch appears at the bottom of this page ...]
  
Adhemerval Zanella Feb. 8, 2023, 7:48 p.m. UTC | #5
On 08/02/23 14:55, Palmer Dabbelt wrote:
> On Wed, 08 Feb 2023 07:13:44 PST (-0800), philipp.tomsich@vrull.eu wrote:
>> On Tue, 7 Feb 2023 at 02:20, Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>>
>>> On Mon, Feb 6, 2023 at 6:23 PM Christoph Muellner
>>> <christoph.muellner@vrull.eu> wrote:
>>> >
>>> > From: Christoph Müllner <christoph.muellner@vrull.eu>
>>> >
>>> > The implementation of strncmp() can be accelerated using Zbb's orc.b
>>> > instruction. Let's add an optimized implementation that makes use
>>> > of this instruction.
>>> >
>>> > Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
>>>
>>> Not necessary, but imo performance patches should have at least some reference
>>> to the expected speedup versus the existing alternatives.
>>
>> Given that this is effectively a SWAR-like optimization (orc.b allows
>> us to test 8 bytes in parallel for a NUL byte), we should be able to
>> show the benefit through a reduction in dynamic instructions.  Would
>> this be considered reasonable reference data?
> 
> Generally for performance improvements the only metrics that count come from real hardware.  Processor implementation is complex and it's not generally true that reducing dynamic instructions results in better performance (particularly when more complex flavors of instructions replace simpler ones).
> 

I agree with Noah here that we need to have some baseline performance numbers,
even though we are comparing against naive implementations (what glibc used to
have as its implementations).

> We've not been so good about this on the RISC-V side of things, though.  I think that's largely because we didn't have all that much complexity around this, but there's a ton of stuff showing up right now.  The general theory has been that Zbb instructions will execute faster than their corresponding I sequences, but nobody has proved that.  I believe the new JH7110 has Zba and Zbb, so maybe the right answer there is to just benchmark things before merging them?  That way we can get back to doing things sanely before we go too far down the premature optimization rabbit hole.
> 
> FWIW: we had a pretty similar discussion in Linux land around these and nobody could get the JH7110 to boot, but given that we have ~6 months until glibc releases again hopefully that will be sorted out.  There's a bunch of ongoing work looking at the more core issues like probing, so maybe it's best to focus on getting that all sorted out first?  It's kind of awkward to have a bunch of routines posted in a whole new framework that's not sorting out all the probing dependencies.

Just a heads up that with the latest generic string routine optimizations, all
str* routines should now use the new Zbb extensions (if the compiler is
instructed to do so). I think you might squeeze some cycles out of a
hand-crafted assembly routine, but I would rather focus on trying to optimize
code generation instead.
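
For context, the portable zero-byte test that the generic routines build on
looks roughly like this (a sketch; glibc's actual helpers live behind
internal headers and may differ in detail):

  #include <stdint.h>
  #include <stdbool.h>

  /* Classic SWAR zero-byte test: a byte's top bit survives the mask
     iff that byte was zero.  With a Zbb-enabled -march, a compiler or
     an arch-specific override can reduce this test to orc.b.  */
  static inline bool
  has_zero (uint64_t x)
  {
    return ((x - 0x0101010101010101ULL) & ~x
            & 0x8080808080808080ULL) != 0;
  }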

The generic routines still assume that the hardware can't issue unaligned
memory accesses, or that doing so is prohibitively expensive.  However, I
think we should move in that direction and start adding unaligned variants
when it makes sense.
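
A common building block for such variants funnels the access through memcpy,
so the compiler emits a single load where unaligned access is cheap and a
safe byte sequence elsewhere (a sketch, not from the patch):

  #include <stdint.h>
  #include <string.h>

  /* Unaligned 64-bit load: a fixed-size memcpy folds to a single load
     instruction on targets with cheap unaligned access.  */
  static inline uint64_t
  load64_unaligned (const void *p)
  {
    uint64_t v;
    memcpy (&v, p, sizeof v);
    return v;
  }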

Another usual tuning is loop unrolling, which depends on the underlying
hardware.  Unfortunately we need to explicitly force gcc to unroll some loop
constructions (for instance, check sysdeps/powerpc/powerpc64/power4/Makefile),
so this might be another approach you can use to tune the RISC-V routines.
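
Besides Makefile-level flags like the power4 example, per-loop pragmas are an
option; a hedged sketch (GCC honors #pragma GCC unroll since GCC 8):

  /* Ask GCC to unroll the copy loop fourfold; the right factor is
     hardware-dependent and would need benchmarking on real cores.  */
  void
  copy_words (unsigned long *dst, const unsigned long *src, int n)
  {
  #pragma GCC unroll 4
    for (int i = 0; i < n; i++)
      dst[i] = src[i];
  }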

The memcpy, memmove, memset, and memcmp routines are a slightly different
subject.  Although the current generic mem routines do use some explicit
unrolling, they do not take unaligned accesses, vector instructions, or
special instructions (such as cache-clearing ones) into consideration.  And
these usually make a lot of difference.

What I would expect is that maybe we can use a strategy similar to what
Google is doing with LLVM libc, which based its work on the automemcpy
paper [1].  It means that, for the unaligned case, each architecture
reimplements the memory routine building blocks.  Although that project
focuses on static compilation, I think using hooks rather than assembly
routines might be a better approach (you can reuse code blocks or try
different strategies more easily).

[1] https://storage.googleapis.com/pub-tools-public-publication-data/pdf/4f7c3da72d557ed418828823a8e59942859d677f.pdf
  

Patch

diff --git a/sysdeps/riscv/multiarch/Makefile b/sysdeps/riscv/multiarch/Makefile
index 056ce2ffc0..9f22e31b99 100644
--- a/sysdeps/riscv/multiarch/Makefile
+++ b/sysdeps/riscv/multiarch/Makefile
@@ -14,5 +14,6 @@  sysdep_routines += \
 	strcmp_generic \
 	strcmp_zbb \
 	strcmp_zbb_unaligned \
-	strncmp_generic
+	strncmp_generic \
+	strncmp_zbb
 endif
diff --git a/sysdeps/riscv/multiarch/ifunc-impl-list.c b/sysdeps/riscv/multiarch/ifunc-impl-list.c
index eb37ed6017..82fd34d010 100644
--- a/sysdeps/riscv/multiarch/ifunc-impl-list.c
+++ b/sysdeps/riscv/multiarch/ifunc-impl-list.c
@@ -64,6 +64,7 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_generic))
 
   IFUNC_IMPL (i, name, strncmp,
+	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_zbb)
 	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_generic))
   return i;
 }
diff --git a/sysdeps/riscv/multiarch/strncmp.c b/sysdeps/riscv/multiarch/strncmp.c
index 970aeb8b85..5b0fe08e98 100644
--- a/sysdeps/riscv/multiarch/strncmp.c
+++ b/sysdeps/riscv/multiarch/strncmp.c
@@ -30,8 +30,12 @@ 
 
 extern __typeof (__redirect_strncmp) __libc_strncmp;
 extern __typeof (__redirect_strncmp) __strncmp_generic attribute_hidden;
+extern __typeof (__redirect_strncmp) __strncmp_zbb attribute_hidden;
 
-libc_ifunc (__libc_strncmp, __strncmp_generic);
+libc_ifunc (__libc_strncmp,
+	    HAVE_RV(zbb)
+	    ? __strncmp_zbb
+	    : __strncmp_generic);
 
 # undef strncmp
 strong_alias (__libc_strncmp, strncmp);
diff --git a/sysdeps/riscv/multiarch/strncmp_zbb.S b/sysdeps/riscv/multiarch/strncmp_zbb.S
new file mode 100644
index 0000000000..29cff30def
--- /dev/null
+++ b/sysdeps/riscv/multiarch/strncmp_zbb.S
@@ -0,0 +1,119 @@ 
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+/* Assumptions: rvi_zbb.  */
+
+#define src1		a0
+#define result		a0
+#define src2		a1
+#define len		a2
+#define data1		a2
+#define data2		a3
+#define align		a4
+#define data1_orcb	t0
+#define limit		t1
+#define fast_limit      t2
+#define m1		t3
+
+#if __riscv_xlen == 64
+# define REG_L	ld
+# define SZREG	8
+# define PTRLOG	3
+#else
+# define REG_L	lw
+# define SZREG	4
+# define PTRLOG	2
+#endif
+
+#ifndef STRNCMP
+# define STRNCMP __strncmp_zbb
+#endif
+
+.option push
+.option arch,+zbb
+
+ENTRY_ALIGN (STRNCMP, 6)
+	beqz	len, L(equal)
+	or	align, src1, src2
+	and	align, align, SZREG-1
+	add	limit, src1, len
+	bnez	align, L(simpleloop)
+	li	m1, -1
+
+	/* Adjust limit for fast-path.  */
+	andi fast_limit, limit, -SZREG
+
+	/* Main loop for aligned string.  */
+	.p2align 3
+L(loop):
+	bge	src1, fast_limit, L(simpleloop)
+	REG_L	data1, 0(src1)
+	REG_L	data2, 0(src2)
+	orc.b	data1_orcb, data1
+	bne	data1_orcb, m1, L(foundnull)
+	addi	src1, src1, SZREG
+	addi	src2, src2, SZREG
+	beq	data1, data2, L(loop)
+
+	/* Words don't match, and no null byte in the first
+	 * word. Get bytes in big-endian order and compare.  */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	rev8	data1, data1
+	rev8	data2, data2
+#endif
+	/* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence.  */
+	sltu	result, data1, data2
+	neg	result, result
+	ori	result, result, 1
+	ret
+
+L(foundnull):
+	/* Found a null byte.
+	 * If words don't match, fall back to simple loop.  */
+	bne	data1, data2, L(simpleloop)
+
+	/* Otherwise, strings are equal.  */
+	li	result, 0
+	ret
+
+	/* Simple loop for misaligned strings.  */
+	.p2align 3
+L(simpleloop):
+	bge	src1, limit, L(equal)
+	lbu	data1, 0(src1)
+	addi	src1, src1, 1
+	lbu	data2, 0(src2)
+	addi	src2, src2, 1
+	bne	data1, data2, L(sub)
+	bnez	data1, L(simpleloop)
+
+L(sub):
+	sub	result, data1, data2
+	ret
+
+L(equal):
+	li	result, 0
+	ret
+
+.option pop
+
+END (STRNCMP)
+libc_hidden_builtin_def (STRNCMP)
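
The sltu/neg/ori tail in the patch is compact enough to deserve a C gloss; a
rough equivalent (illustrative names, not from the patch):

  #include <stdint.h>

  /* After the rev8 byte swap, an unsigned word compare gives the sign
     of the result: -1 if a < b, else +1 (equal words never get here).  */
  static inline int
  compare_tail (uint64_t a, uint64_t b)
  {
    int lt = a < b;     /* sltu result, data1, data2        */
    return (-lt) | 1;   /* neg result, result; ori result, result, 1 */
  }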