[RFC,18/19] riscv: Add an optimized strncmp routine
Checks
Context: dj/TryBot-apply_patch
Check: success
Description: Patch applied to master at the time it was sent
Commit Message
From: Christoph Müllner <christoph.muellner@vrull.eu>
The implementation of strncmp() can be accelerated using Zbb's orc.b
instruction. Let's add an optimized implementation that makes use
of this instruction.
Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
---
sysdeps/riscv/multiarch/Makefile | 3 +-
sysdeps/riscv/multiarch/ifunc-impl-list.c | 1 +
sysdeps/riscv/multiarch/strncmp.c | 6 +-
sysdeps/riscv/multiarch/strncmp_zbb.S | 119 ++++++++++++++++++++++
4 files changed, 127 insertions(+), 2 deletions(-)
create mode 100644 sysdeps/riscv/multiarch/strncmp_zbb.S
Comments
On Mon, Feb 6, 2023 at 6:23 PM Christoph Muellner
<christoph.muellner@vrull.eu> wrote:
>
> From: Christoph Müllner <christoph.muellner@vrull.eu>
>
> The implementation of strncmp() can be accelerated using Zbb's orc.b
> instruction. Let's add an optimized implementation that makes use
> of this instruction.
>
> Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
Not necessary, but imo performance patches should have at least some reference
to the expected speedup versus the existing alternatives.
On Tue, 7 Feb 2023 at 02:20, Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Feb 6, 2023 at 6:23 PM Christoph Muellner
> <christoph.muellner@vrull.eu> wrote:
> >
> > From: Christoph Müllner <christoph.muellner@vrull.eu>
> >
> > The implementation of strncmp() can be accelerated using Zbb's orc.b
> > instruction. Let's add an optimized implementation that makes use
> > of this instruction.
> >
> > Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
>
> Not necessary, but imo performance patches should have at least some reference
> to the expected speedup versus the existing alternatives.
Given that this is effectively a SWAR-like optimization (orc.b allows
us to test 8 bytes in parallel for a NUL byte), we should be able to
show the benefit through a reduction in dynamic instructions. Would
this be considered reasonable reference data?
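For reference, the effect of orc.b is that each byte of the result becomes
0xff if the corresponding source byte is non-zero and 0x00 otherwise, so a
word contains a NUL byte exactly when the orc.b result is not all-ones. A
rough C model of that check (illustrative only; orc_b_model stands in for
the real instruction):

    #include <stdint.h>

    /* Illustrative model of Zbb's orc.b: each result byte is 0xff if the
       corresponding input byte is non-zero, 0x00 otherwise.  */
    static uint64_t
    orc_b_model (uint64_t x)
    {
      uint64_t r = 0;
      for (int i = 0; i < 8; i++)
        if ((x >> (8 * i)) & 0xff)
          r |= (uint64_t) 0xff << (8 * i);
      return r;
    }

    /* A word holds a NUL byte exactly when orc.b(word) != all-ones; this
       is the single compare-and-branch used in the patch's aligned loop.  */
    static int
    has_nul (uint64_t word)
    {
      return orc_b_model (word) != UINT64_MAX;
    }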
On Wed, 08 Feb 2023 07:13:44 PST (-0800), philipp.tomsich@vrull.eu wrote:
> On Tue, 7 Feb 2023 at 02:20, Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> On Mon, Feb 6, 2023 at 6:23 PM Christoph Muellner
>> <christoph.muellner@vrull.eu> wrote:
>> >
>> > From: Christoph Müllner <christoph.muellner@vrull.eu>
>> >
>> > The implementation of strncmp() can be accelerated using Zbb's orc.b
>> > instruction. Let's add an optimized implementation that makes use
>> > of this instruction.
>> >
>> > Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
>>
>> Not necessary, but imo performance patches should have at least some reference
>> to the expected speedup versus the existing alternatives.
>
> Given that this is effectively a SWAR-like optimization (orc.b allows
> us to test 8 bytes in parallel for a NUL byte), we should be able to
> show the benefit through a reduction in dynamic instructions. Would
> this be considered reasonable reference data?
Generally for performance improvements the only metrics that count come
from real hardware. Processor implementation is complex and it's not
generally true that reducing dynamic instructions results in better
performance (particularly when more complex flavors of instructions replace
simpler ones).
We've not been so good about this on the RISC-V side of things, though.
I think that's largely because we didn't have all that much complexity
around this, but there's a ton of stuff showing up right now. The
general theory has been that Zbb instructions will execute faster than
their corresponding I sequences, but nobody has proved that. I believe
the new JH7110 has Zba and Zbb, so maybe the right answer there is to
just benchmark things before merging them? That way we can get back to
doing things sanely before we go too far down the premature optimization
rabbit hole.
FWIW: we had a pretty similar discussion in Linux land around these and
nobody could get the JH7110 to boot, but given that we have ~6 months
until glibc releases again hopefully that will be sorted out. There's a
bunch of ongoing work looking at the more core issues like probing, so
maybe it's best to focus on getting that all sorted out first? It's
kind of awkward to have a bunch of routines posted in a whole new
framework that's not sorting out all the probing dependencies.
On Wed, Feb 8, 2023 at 9:13 AM Philipp Tomsich <philipp.tomsich@vrull.eu> wrote:
>
> On Tue, 7 Feb 2023 at 02:20, Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Mon, Feb 6, 2023 at 6:23 PM Christoph Muellner
> > <christoph.muellner@vrull.eu> wrote:
> > >
> > > From: Christoph Müllner <christoph.muellner@vrull.eu>
> > >
> > > The implementation of strncmp() can be accelerated using Zbb's orc.b
> > > instruction. Let's add an optimized implementation that makes use
> > > of this instruction.
> > >
> > > Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
> >
> > Not necessary, but imo performance patches should have at least some reference
> > to the expected speedup versus the existing alternatives.
>
> Given that this is effectively a SWAR-like optimization (orc.b allows
> us to test 8 bytes in parallel for a NUL byte), we should be able to
> show the benefit through a reduction in dynamic instructions. Would
> this be considered reasonable reference data?
>
Glibc has a benchmark suite for all the string/memory functions, so I
would expect improvements in those results compared to the generic
implementations.
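For reference, the harness for this lives in glibc's benchtests/ directory
(there is an existing bench-strncmp), and per benchtests/README a subset of
the suite can be run with 'make bench BENCHSET=...'.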
On 08/02/23 14:55, Palmer Dabbelt wrote:
> On Wed, 08 Feb 2023 07:13:44 PST (-0800), philipp.tomsich@vrull.eu wrote:
>> On Tue, 7 Feb 2023 at 02:20, Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>>
>>> On Mon, Feb 6, 2023 at 6:23 PM Christoph Muellner
>>> <christoph.muellner@vrull.eu> wrote:
>>> >
>>> > From: Christoph Müllner <christoph.muellner@vrull.eu>
>>> >
>>> > The implementation of strncmp() can be accelerated using Zbb's orc.b
>>> > instruction. Let's add an optimized implementation that makes use
>>> > of this instruction.
>>> >
>>> > Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
>>>
>>> Not necessary, but imo performance patches should have at least some reference
>>> to the expected speedup versus the existing alternatives.
>>
>> Given that this is effectively a SWAR-like optimization (orc.b allows
>> us to test 8 bytes in parallel for a NUL byte), we should be able to
>> show the benefit through a reduction in dynamic instructions. Would
>> this be considered reasonable reference data?
>
> Generally for performance improvements the only metrics that count come from real hardware. Processor implementation is complex and it's not generally true that reducing dynamic instructions results in better performance (particularly when more complex flavors of instructions replace simpler ones).
>
I agree with Noah here that we need to have some baseline performance
numbers, even though we are comparing against naive implementations (what
glibc used to have for its generic implementations).
> We've not been so good about this on the RISC-V side of things, though. I think that's largely because we didn't have all that much complexity around this, but there's a ton of stuff showing up right now. The general theory has been that Zbb instructions will execute faster than their corresponding I sequences, but nobody has proved that. I believe the new JH7110 has Zba and Zbb, so maybe the right answer there is to just benchmark things before merging them? That way we can get back to doing things sanely before we go too far down the premature optimization rabbit hole.
>
> FWIW: we had a pretty similar discussion in Linux land around these and nobody could get the JH7110 to boot, but given that we have ~6 months until glibc releases again hopefully that will be sorted out. There's a bunch of ongoing work looking at the more core issues like probing, so maybe it's best to focus on getting that all sorted out first? It's kind of awkward to have a bunch of routines posted in a whole new framework that's not sorting out all the probing dependencies.
Just a heads up that with the latest generic string routine optimizations,
all str* routines should now use the new Zbb extensions (if the compiler
is instructed to do so). I think you might squeeze some cycles out of a
hand-crafted assembly routine, but I would rather focus on trying to
optimize code generation instead.
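For context, the reworked generic routines are built around small
word-at-a-time primitives that each architecture can override, and the Zbb
code generation mentioned above happens at that level. A sketch of the
classic portable zero-byte test they rely on (illustrative, not the exact
glibc code):

    #include <stdbool.h>
    #include <stdint.h>

    /* Portable word-at-a-time test: true iff some byte of X is zero.
       An architecture override can replace this kind of check with a
       single orc.b-based comparison when Zbb is available.  */
    static bool
    has_zero_byte (uint64_t x)
    {
      const uint64_t lsb = 0x0101010101010101ULL;
      const uint64_t msb = 0x8080808080808080ULL;
      return ((x - lsb) & ~x & msb) != 0;
    }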
The generic routines still assume that the hardware can't issue unaligned
memory accesses, or that doing so is prohibitively expensive. However, I
think we are moving in the direction of adding unaligned variants where
they make sense.
Another usual tuning is loop unrolling, which depends on the underlying
hardware. Unfortunately we need to explicitly force GCC to unroll some
loop constructs (for instance, check
sysdeps/powerpc/powerpc64/power4/Makefile), so this might be another
approach you could use to tune the RISC-V routines.
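As an alternative to per-directory Makefile flags, GCC also honors an
unrolling pragma on individual loops (GCC 8 and newer). A hypothetical
sketch, not glibc code:

    /* Hypothetical example: forcing GCC to unroll one byte loop in place
       rather than compiling the whole file with -funroll-loops.  */
    static int
    bytecmp (const unsigned char *a, const unsigned char *b, unsigned long n)
    {
    #pragma GCC unroll 4
      for (unsigned long i = 0; i < n; i++)
        if (a[i] != b[i])
          return a[i] - b[i];
      return 0;
    }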
The memcpy, memmove, memset, and memcmp routines are a slightly different
subject. Although the current generic mem routines do use some explicit
unrolling, they do not take unaligned access, vector instructions, or
special instructions (such as cache-clearing ones) into consideration.
And these usually make a lot of difference.
What I would expect is that maybe we can use a strategy similar to what
Google is doing with LLVM libc, which based its work on the automemcpy
paper [1]. It means that each architecture reimplements the building
blocks of the memory routines (for the unaligned cases, for example).
Although that project focuses on static compilation, I think using hooks
over assembly routines might be a better approach (you can reuse code
blocks or try different strategies more easily).
[1] https://storage.googleapis.com/pub-tools-public-publication-data/pdf/4f7c3da72d557ed418828823a8e59942859d677f.pdf
Patch
diff --git a/sysdeps/riscv/multiarch/Makefile b/sysdeps/riscv/multiarch/Makefile
@@ -14,5 +14,6 @@ sysdep_routines += \
strcmp_generic \
strcmp_zbb \
strcmp_zbb_unaligned \
- strncmp_generic
+ strncmp_generic \
+ strncmp_zbb
endif
diff --git a/sysdeps/riscv/multiarch/ifunc-impl-list.c b/sysdeps/riscv/multiarch/ifunc-impl-list.c
@@ -64,6 +64,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_generic))
IFUNC_IMPL (i, name, strncmp,
+ IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_zbb)
IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_generic))
return i;
}
diff --git a/sysdeps/riscv/multiarch/strncmp.c b/sysdeps/riscv/multiarch/strncmp.c
@@ -30,8 +30,12 @@
extern __typeof (__redirect_strncmp) __libc_strncmp;
extern __typeof (__redirect_strncmp) __strncmp_generic attribute_hidden;
+extern __typeof (__redirect_strncmp) __strncmp_zbb attribute_hidden;
-libc_ifunc (__libc_strncmp, __strncmp_generic);
+libc_ifunc (__libc_strncmp,
+ HAVE_RV(zbb)
+ ? __strncmp_zbb
+ : __strncmp_generic);
# undef strncmp
strong_alias (__libc_strncmp, strncmp);
diff --git a/sysdeps/riscv/multiarch/strncmp_zbb.S b/sysdeps/riscv/multiarch/strncmp_zbb.S
new file mode 100644
@@ -0,0 +1,119 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+/* Assumptions: rvi_zbb. */
+
+#define src1 a0
+#define result a0
+#define src2 a1
+#define len a2
+#define data1 a2
+#define data2 a3
+#define align a4
+#define data1_orcb t0
+#define limit t1
+#define fast_limit t2
+#define m1 t3
+
+#if __riscv_xlen == 64
+# define REG_L ld
+# define SZREG 8
+# define PTRLOG 3
+#else
+# define REG_L lw
+# define SZREG 4
+# define PTRLOG 2
+#endif
+
+#ifndef STRNCMP
+# define STRNCMP __strncmp_zbb
+#endif
+
+.option push
+.option arch,+zbb
+
+ENTRY_ALIGN (STRNCMP, 6)
+ beqz len, L(equal)
+ or align, src1, src2
+ and align, align, SZREG-1
+ add limit, src1, len
+ bnez align, L(simpleloop)
+ li m1, -1
+
+ /* Adjust limit for fast-path. */
+ andi fast_limit, limit, -SZREG
+
+ /* Main loop for aligned string. */
+ .p2align 3
+L(loop):
+ bge src1, fast_limit, L(simpleloop)
+ REG_L data1, 0(src1)
+ REG_L data2, 0(src2)
+ orc.b data1_orcb, data1
+ bne data1_orcb, m1, L(foundnull)
+ addi src1, src1, SZREG
+ addi src2, src2, SZREG
+ beq data1, data2, L(loop)
+
+ /* Words don't match, and no null byte in the first
+ * word. Get bytes in big-endian order and compare. */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ rev8 data1, data1
+ rev8 data2, data2
+#endif
+ /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence. */
+ sltu result, data1, data2
+ neg result, result
+ ori result, result, 1
+ ret
+
+L(foundnull):
+ /* Found a null byte.
+ * If words don't match, fall back to simple loop. */
+ bne data1, data2, L(simpleloop)
+
+ /* Otherwise, strings are equal. */
+ li result, 0
+ ret
+
+ /* Simple loop for misaligned strings. */
+ .p2align 3
+L(simpleloop):
+ bge src1, limit, L(equal)
+ lbu data1, 0(src1)
+ addi src1, src1, 1
+ lbu data2, 0(src2)
+ addi src2, src2, 1
+ bne data1, data2, L(sub)
+ bnez data1, L(simpleloop)
+
+L(sub):
+ sub result, data1, data2
+ ret
+
+L(equal):
+ li result, 0
+ ret
+
+.option pop
+
+END (STRNCMP)
+libc_hidden_builtin_def (STRNCMP)