[RFC,16/19] riscv: Add accelerated strcmp routines
Checks
Context: dj/TryBot-apply_patch
Check: success
Description: Patch applied to master at the time it was sent
Commit Message
From: Christoph Müllner <christoph.muellner@vrull.eu>
The implementation of strcmp() can be accelerated using Zbb's orc.b
instruction and fast unaligned accesses. However, strcmp can use
unaligned accesses only if they do not change the exception
behaviour (compared to a single-byte compare loop).
Let's add an implementation that does all that.
Additionally, let's add the strcmp implementation from the
Bitmanip specification, which does not do any unaligned accesses.
Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
---
sysdeps/riscv/multiarch/Makefile | 4 +-
sysdeps/riscv/multiarch/ifunc-impl-list.c | 4 +-
sysdeps/riscv/multiarch/strcmp.c | 11 +-
sysdeps/riscv/multiarch/strcmp_zbb.S | 104 +++++++++
.../riscv/multiarch/strcmp_zbb_unaligned.S | 213 ++++++++++++++++++
5 files changed, 332 insertions(+), 4 deletions(-)
create mode 100644 sysdeps/riscv/multiarch/strcmp_zbb.S
create mode 100644 sysdeps/riscv/multiarch/strcmp_zbb_unaligned.S
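The core trick in both new routines is that orc.b turns every non-zero
byte into 0xff, so the result is all-ones exactly when the word contains
no NUL byte. A minimal C model of that check for the RV64 case (an
illustration with made-up helper names, not part of the patch):

#include <stdint.h>

/* Model of Zbb's orc.b: every non-zero byte becomes 0xff, every zero
   byte stays 0x00.  Illustration only.  */
static uint64_t
orc_b (uint64_t x)
{
  uint64_t r = 0;
  for (int i = 0; i < 8; i++)
    if ((x >> (8 * i)) & 0xff)
      r |= (uint64_t) 0xff << (8 * i);
  return r;
}

/* A word contains a NUL byte iff orc.b of it is not all-ones; this is
   the "bne data1_orcb, m1, ..." check in the assembly (m1 holds -1).  */
static int
word_has_nul (uint64_t w)
{
  return orc_b (w) != UINT64_MAX;
}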
Comments
Is it possible to make the optimized generic string routine [1] ifunc-
aware in some way? Or can we "templatize" it so we can make vectorized
string routines more easily?
[1]: https://sourceware.org/pipermail/libc-alpha/2023-February/145211.html
On Tue, 2023-02-07 at 01:16 +0100, Christoph Muellner wrote:
> From: Christoph Müllner <christoph.muellner@vrull.eu>
>
> The implementation of strcmp() can be accelerated using Zbb's orc.b
> instruction and fast unaligned accesses. However, strcmp can use
> unaligned accesses only if they do not change the exception
> behaviour (compared to a single-byte compare loop).
> Let's add an implementation that does all that.
> Additionally, let's add the strcmp implementation from the
> Bitmanip specification, which does not do any unaligned accesses.
>
> Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
On Tue, Feb 7, 2023 at 12:57 PM Xi Ruoyao <xry111@xry111.site> wrote:
> Is it possible to make the optimized generic string routine [1] ifunc-
> aware in some way? Or can we "templatize" it so we can make vectorized
> string routines more easily?
>
You mean to compile the generic code multiple times with different sets
of enabled extensions? Yes, that should work (my patchset does the same
for memset). Let's wait for the referenced patchset to land (looks like this
will happen soon).
Thanks,
Christoph
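A rough sketch of what that could look like for strcmp, assuming the
generic string/strcmp.c can be renamed via a STRCMP macro and that
per-file arch flags are acceptable (the file name, macro hook, and
-march value below are illustrative assumptions, not existing
interfaces):

/* sysdeps/riscv/multiarch/strcmp_generic_zbb.c -- hypothetical sketch.
   Build the generic C strcmp a second time with Zbb enabled, under a
   different symbol name, and let the ifunc resolver pick it.  The
   Makefile would also need something like:
     CFLAGS-strcmp_generic_zbb.c += -march=rv64gc_zbb  */
#define STRCMP __strcmp_generic_zbb
#include <string/strcmp.c>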
diff --git a/sysdeps/riscv/multiarch/Makefile b/sysdeps/riscv/multiarch/Makefile
index 3017bde75a..73a62be85d 100644
--- a/sysdeps/riscv/multiarch/Makefile
+++ b/sysdeps/riscv/multiarch/Makefile
@@ -11,5 +11,7 @@ sysdep_routines += \
strlen_generic \
strlen_zbb \
\
- strcmp_generic
+ strcmp_generic \
+ strcmp_zbb \
+ strcmp_zbb_unaligned
endif
diff --git a/sysdeps/riscv/multiarch/ifunc-impl-list.c b/sysdeps/riscv/multiarch/ifunc-impl-list.c
index 64331a4c7f..d354aa1178 100644
--- a/sysdeps/riscv/multiarch/ifunc-impl-list.c
+++ b/sysdeps/riscv/multiarch/ifunc-impl-list.c
@@ -59,7 +59,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_generic))
IFUNC_IMPL (i, name, strcmp,
- IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcmp_generic))
+ IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_zbb_unaligned)
+ IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_zbb)
+ IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_generic))
return i;
}
diff --git a/sysdeps/riscv/multiarch/strcmp.c b/sysdeps/riscv/multiarch/strcmp.c
index 8c21a90afd..d3f2fe19ae 100644
--- a/sysdeps/riscv/multiarch/strcmp.c
+++ b/sysdeps/riscv/multiarch/strcmp.c
@@ -30,8 +30,15 @@
extern __typeof (__redirect_strcmp) __libc_strcmp;
extern __typeof (__redirect_strcmp) __strcmp_generic attribute_hidden;
-
-libc_ifunc (__libc_strcmp, __strcmp_generic);
+extern __typeof (__redirect_strcmp) __strcmp_zbb attribute_hidden;
+extern __typeof (__redirect_strcmp) __strcmp_zbb_unaligned attribute_hidden;
+
+libc_ifunc (__libc_strcmp,
+ HAVE_RV(zbb) && HAVE_FAST_UNALIGNED()
+ ? __strcmp_zbb_unaligned
+ : HAVE_RV(zbb)
+ ? __strcmp_zbb
+ : __strcmp_generic);
# undef strcmp
strong_alias (__libc_strcmp, strcmp);
diff --git a/sysdeps/riscv/multiarch/strcmp_zbb.S b/sysdeps/riscv/multiarch/strcmp_zbb.S
new file mode 100644
index 0000000000..1c265d6107
--- /dev/null
+++ b/sysdeps/riscv/multiarch/strcmp_zbb.S
@@ -0,0 +1,104 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+/* Assumptions: rvi_zbb. */
+/* Implementation from the Bitmanip specification. */
+
+#define src1 a0
+#define result a0
+#define src2 a1
+#define data1 a2
+#define data2 a3
+#define align a4
+#define data1_orcb t0
+#define m1 t2
+
+#if __riscv_xlen == 64
+# define REG_L ld
+# define SZREG 8
+#else
+# define REG_L lw
+# define SZREG 4
+#endif
+
+#ifndef STRCMP
+# define STRCMP __strcmp_zbb
+#endif
+
+.option push
+.option arch,+zbb
+
+ENTRY_ALIGN (STRCMP, 6)
+ or align, src1, src2
+ and align, align, SZREG-1
+ bnez align, L(simpleloop)
+ li m1, -1
+
+ /* Main loop for aligned strings. */
+ .p2align 2
+L(loop):
+ REG_L data1, 0(src1)
+ REG_L data2, 0(src2)
+ orc.b data1_orcb, data1
+ bne data1_orcb, m1, L(foundnull)
+ addi src1, src1, SZREG
+ addi src2, src2, SZREG
+ beq data1, data2, L(loop)
+
+ /* Words don't match, and no null byte in the first word.
+ * Get bytes in big-endian order and compare. */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ rev8 data1, data1
+ rev8 data2, data2
+#endif
+ /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence. */
+ sltu result, data1, data2
+ neg result, result
+ ori result, result, 1
+ ret
+
+L(foundnull):
+ /* Found a null byte.
+ * If words don't match, fall back to simple loop. */
+ bne data1, data2, L(simpleloop)
+
+ /* Otherwise, strings are equal. */
+ li result, 0
+ ret
+
+ /* Simple loop for misaligned strings. */
+ .p2align 3
+L(simpleloop):
+ lbu data1, 0(src1)
+ lbu data2, 0(src2)
+ addi src1, src1, 1
+ addi src2, src2, 1
+ bne data1, data2, L(sub)
+ bnez data1, L(simpleloop)
+
+L(sub):
+ sub result, data1, data2
+ ret
+
+.option pop
+
+END (STRCMP)
+libc_hidden_builtin_def (STRCMP)
diff --git a/sysdeps/riscv/multiarch/strcmp_zbb_unaligned.S b/sysdeps/riscv/multiarch/strcmp_zbb_unaligned.S
new file mode 100644
index 0000000000..ec21982b65
--- /dev/null
+++ b/sysdeps/riscv/multiarch/strcmp_zbb_unaligned.S
@@ -0,0 +1,213 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+/* Assumptions: rvi_zbb with fast unaligned access. */
+/* Implementation inspired by aarch64/strcmp.S. */
+
+#define src1 a0
+#define result a0
+#define src2 a1
+#define off a3
+#define m1 a4
+#define align1 a5
+#define src3 a6
+#define tmp a7
+
+#define data1 t0
+#define data2 t1
+#define b1 t0
+#define b2 t1
+#define data3 t2
+#define data1_orcb t3
+#define data3_orcb t4
+#define shift t5
+
+#if __riscv_xlen == 64
+# define REG_L ld
+# define SZREG 8
+# define PTRLOG 3
+#else
+# define REG_L lw
+# define SZREG 4
+# define PTRLOG 2
+#endif
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+# error big endian is untested!
+# define CZ ctz
+# define SHIFT srl
+# define SHIFT2 sll
+#else
+# define CZ ctz
+# define SHIFT sll
+# define SHIFT2 srl
+#endif
+
+#ifndef STRCMP
+# define STRCMP __strcmp_zbb_unaligned
+#endif
+
+.option push
+.option arch,+zbb
+
+ENTRY_ALIGN (STRCMP, 6)
+ /* off...delta from src1 to src2. */
+ sub off, src2, src1
+ li m1, -1
+ andi tmp, off, SZREG-1
+ andi align1, src1, SZREG-1
+ bnez tmp, L(misaligned8)
+ bnez align1, L(mutual_align)
+
+ .p2align 4
+L(loop_aligned):
+ REG_L data1, 0(src1)
+ add tmp, src1, off
+ addi src1, src1, SZREG
+ REG_L data2, 0(tmp)
+
+L(start_realigned):
+ orc.b data1_orcb, data1
+ bne data1_orcb, m1, L(end)
+ beq data1, data2, L(loop_aligned)
+
+L(fast_end):
+ /* Words don't match, and no NUL byte in one word.
+ Get bytes in big-endian order and compare as words. */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ rev8 data1, data1
+ rev8 data2, data2
+#endif
+ /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence. */
+ sltu result, data1, data2
+ neg result, result
+ ori result, result, 1
+ ret
+
+L(end_orc):
+ orc.b data1_orcb, data1
+L(end):
+ /* Words don't match or NUL byte in at least one word.
+ data1_orcb holds orc.b value of data1. */
+ xor tmp, data1, data2
+ orc.b tmp, tmp
+
+ orn tmp, tmp, data1_orcb
+ CZ shift, tmp
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ rev8 data1, data1
+ rev8 data2, data2
+#endif
+ sll data1, data1, shift
+ sll data2, data2, shift
+ srl b1, data1, SZREG*8-8
+ srl b2, data2, SZREG*8-8
+
+L(end_singlebyte):
+ sub result, b1, b2
+ ret
+
+ .p2align 4
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point. */
+ andi src1, src1, -SZREG
+ add tmp, src1, off
+ REG_L data1, 0(src1)
+ addi src1, src1, SZREG
+ REG_L data2, 0(tmp)
+ /* Get number of bits to mask. */
+ sll shift, src2, 3
+ /* Bits to mask are now 0, others are 1. */
+ SHIFT tmp, m1, shift
+ /* Or with inverted value -> masked bits become 1. */
+ orn data1, data1, tmp
+ orn data2, data2, tmp
+ j L(start_realigned)
+
+L(misaligned8):
+ /* Skip slow loop if SRC1 is aligned. */
+ beqz align1, L(src1_aligned)
+L(do_misaligned):
+ /* Align SRC1 to 8 bytes. */
+ lbu b1, 0(src1)
+ lbu b2, 0(src2)
+ beqz b1, L(end_singlebyte)
+ bne b1, b2, L(end_singlebyte)
+ addi src1, src1, 1
+ addi src2, src2, 1
+ andi align1, src1, SZREG-1
+ bnez align1, L(do_misaligned)
+
+L(src1_aligned):
+ /* SRC1 is aligned. Align SRC2 down and check for NUL there.
+ * If there is no NUL, we may read the next word from SRC2.
+ * If there is a NUL, we must not read a complete word from SRC2
+ * because we might cross a page boundary. */
+ /* Get number of bits to mask (upper bits are ignored by shifts). */
+ sll shift, src2, 3
+ /* src3 := align_down (src2) */
+ andi src3, src2, -SZREG
+ REG_L data3, 0(src3)
+ addi src3, src3, SZREG
+
+ /* Bits to mask are now 0, others are 1. */
+ SHIFT tmp, m1, shift
+ /* Or with inverted value -> masked bits become 1. */
+ orn data3_orcb, data3, tmp
+ /* Check for NUL in next aligned word. */
+ orc.b data3_orcb, data3_orcb
+ bne data3_orcb, m1, L(unaligned_nul)
+
+ .p2align 4
+L(loop_unaligned):
+ /* Read the (aligned) data1 and the unaligned data2. */
+ REG_L data1, 0(src1)
+ addi src1, src1, SZREG
+ REG_L data2, 0(src2)
+ addi src2, src2, SZREG
+ orc.b data1_orcb, data1
+ bne data1_orcb, m1, L(end)
+ bne data1, data2, L(end)
+
+ /* Read the next aligned-down word. */
+ REG_L data3, 0(src3)
+ addi src3, src3, SZREG
+ orc.b data3_orcb, data3
+ beq data3_orcb, m1, L(loop_unaligned)
+
+L(unaligned_nul):
+ /* src1 points to unread word (only first bytes relevant).
+ * data3 holds next aligned-down word with NUL.
+ * Compare the first bytes of data1 with the last bytes of data3. */
+ REG_L data1, 0(src1)
+ /* Shift NUL bytes into data3 to become data2. */
+ SHIFT2 data2, data3, shift
+ bne data1, data2, L(end_orc)
+ li result, 0
+ ret
+
+.option pop
+
+END (STRCMP)
+libc_hidden_builtin_def (STRCMP)
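Both routines end with the same branchless tail: after rev8 has
byte-reversed the words so the first differing byte is most
significant, the sltu/neg/ori sequence computes
(data1 >= data2) ? 1 : -1. In C (RV64 word size assumed, illustration
only):

#include <stdint.h>

/* C model of the branchless tail used by both implementations.  */
static int
cmp_tail (uint64_t data1, uint64_t data2)
{
  int lt = data1 < data2;   /* sltu result, data1, data2 */
  int res = -lt;            /* neg result, result: 0 or -1 */
  return res | 1;           /* ori result, result, 1: 1 or -1 */
}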