powerpc: Optimized strlen for POWER9
Commit Message
This version performs much better than the POWER8 version for medium
length strings (50-100 bytes) as well as smaller gains on small
and long unaligned strings.
---
sysdeps/powerpc/powerpc64/le/power9/strlen.S | 102 ++++++++++++++++++
sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
.../powerpc64/multiarch/ifunc-impl-list.c | 4 +
.../powerpc64/multiarch/strlen-power9.S | 24 +++++
sysdeps/powerpc/powerpc64/multiarch/strlen.c | 17 ++-
5 files changed, 143 insertions(+), 6 deletions(-)
create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strlen.S
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S
Comments
On 01/05/2020 00:32, Anton Blanchard via Libc-alpha wrote:
> This version performs much better than the POWER8 version for medium
> length strings (50-100 bytes) as well as smaller gains on small
> and long unaligned strings.
As for strcmp, it seems that it uses the ISA 3.0 partial stores to
optimize vector instructions usage. Could you add it on the commit
message?
Usually for such optimizations we try to get a baseline benchmark results
using glibc benchtests. Could you post the results for before and after?
LGTM with some clarification below.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
> ---
> sysdeps/powerpc/powerpc64/le/power9/strlen.S | 102 ++++++++++++++++++
> sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
> .../powerpc64/multiarch/ifunc-impl-list.c | 4 +
> .../powerpc64/multiarch/strlen-power9.S | 24 +++++
> sysdeps/powerpc/powerpc64/multiarch/strlen.c | 17 ++-
> 5 files changed, 143 insertions(+), 6 deletions(-)
> create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strlen.S
> create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S
>
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strlen.S b/sysdeps/powerpc/powerpc64/le/power9/strlen.S
> new file mode 100644
> index 0000000000..afaa76907f
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strlen.S
> @@ -0,0 +1,102 @@
> +/* Optimized strlen implementation for PowerPC64/POWER9.
> + Copyright (C) 2020 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +#ifndef STRLEN
> +# define STRLEN strlen
> +#endif
> +
> +/* Implements the function
> +
> + int [r3] strlen (char *s [r3])
> +
> + The implementation can load bytes past a null terminator, but only
> + up to the next 16B boundary, so it never crosses a page. */
> +
> +.machine power9
I assume that minimal supported binutils won't barf on this, correct?
> +ENTRY_TOCLESS (STRLEN, 4)
> + CALL_MCOUNT 1
> +
> + vspltisb v19,-1 /* Ones in v19 */
> + vspltisb v18,0 /* Zeroes in v18 */
> +
> + neg r5,r3
> + rldicl r9,r5,0,60 /* How many bytes to get source 16B aligned? */
> +
> + /* Align data and fill bytes not loaded with ones */
> + lvx v0,0,r3
> + lvsr v1,0,r3
> + vperm v0,v19,v0,v1
> +
> + vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
> + beq cr6,L(aligned)
> +
> + vctzlsbb r3,v6
> + blr
> +
> +L(aligned):
> + add r4,r3,r9
> + mr r3,r9
> +
> +L(loop):
Should we enforce alignment here?
> + lxv v0+32,0(r4)
> + vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
> + bne cr6,L(tail1)
> +
> + lxv v0+32,16(r4)
> + vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
> + bne cr6,L(tail2)
> +
> + lxv v0+32,32(r4)
> + vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
> + bne cr6,L(tail3)
> +
> + lxv v0+32,48(r4)
> + vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
> + bne cr6,L(tail4)
As for strcmp, why unroll 4x time here?
> +
> + addi r3,r3,64
> + addi r4,r4,64
> + b L(loop)
> +
> +L(tail1):
> + vctzlsbb r0,v6
> + add r3,r3,r0
> + blr
> +
> +L(tail2):
> + vctzlsbb r0,v6
> + add r3,r3,r0
> + addi r3,r3,16
> + blr
> +
> +L(tail3):
> + vctzlsbb r0,v6
> + add r3,r3,r0
> + addi r3,r3,32
> + blr
> +
> +L(tail4):
> + vctzlsbb r0,v6
> + add r3,r3,r0
> + addi r3,r3,48
> + blr
> +
> +END (STRLEN)
> +libc_hidden_builtin_def (strlen)
Ok.
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index ea936bf9ed..1f9318bfbb 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> @@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
> strncase-power8
>
> ifneq (,$(filter %le,$(config-machine)))
> -sysdep_routines += strcmp-power9 strncmp-power9
> +sysdep_routines += strcmp-power9 strncmp-power9 strlen-power9
> endif
> CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
> CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
Ok.
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index b9fef3f43c..1fe7fb7812 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -103,6 +103,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c. */
> IFUNC_IMPL (i, name, strlen,
> +#ifdef __LITTLE_ENDIAN__
> + IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_00,
> + __strlen_power9)
> +#endif
> IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07,
> __strlen_power8)
> IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_HAS_VSX,
Ok.
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S
> new file mode 100644
> index 0000000000..223ff54c39
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S
> @@ -0,0 +1,24 @@
> +/* Optimized strlen implementation for POWER8.
> + Copyright (C) 2020 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define STRLEN __strlen_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +#include <sysdeps/powerpc/powerpc64/le/power9/strlen.S>
Ok.
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
> index e587554221..c418c2bde4 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
> @@ -30,13 +30,20 @@ extern __typeof (__redirect_strlen) __libc_strlen;
> extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden;
> extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden;
> extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (__redirect_strlen) __strlen_power9 attribute_hidden;
> +# endif
>
> libc_ifunc (__libc_strlen,
> - (hwcap2 & PPC_FEATURE2_ARCH_2_07)
> - ? __strlen_power8 :
> - (hwcap & PPC_FEATURE_HAS_VSX)
> - ? __strlen_power7
> - : __strlen_ppc);
> +# ifdef __LITTLE_ENDIAN__
> + (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> + ? __strlen_power9 :
> +# endif
> + (hwcap2 & PPC_FEATURE2_ARCH_2_07)
> + ? __strlen_power8 :
> + (hwcap & PPC_FEATURE_HAS_VSX)
> + ? __strlen_power7
> + : __strlen_ppc);
>
> #undef strlen
> strong_alias (__libc_strlen, strlen)
>
Ok.
new file mode 100644
@@ -0,0 +1,102 @@
+/* Optimized strlen implementation for PowerPC64/POWER9.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifndef STRLEN
+# define STRLEN strlen
+#endif
+
+/* Implements the function
+
+ int [r3] strlen (char *s [r3])
+
+ The implementation can load bytes past a null terminator, but only
+ up to the next 16B boundary, so it never crosses a page. */
+
+.machine power9
+ENTRY_TOCLESS (STRLEN, 4)
+ CALL_MCOUNT 1
+
+ vspltisb v19,-1 /* Ones in v19 */
+ vspltisb v18,0 /* Zeroes in v18 */
+
+ neg r5,r3
+ rldicl r9,r5,0,60 /* How many bytes to get source 16B aligned? */
+
+ /* Align data and fill bytes not loaded with ones */
+ lvx v0,0,r3
+ lvsr v1,0,r3
+ vperm v0,v19,v0,v1
+
+ vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
+ beq cr6,L(aligned)
+
+ vctzlsbb r3,v6
+ blr
+
+L(aligned):
+ add r4,r3,r9
+ mr r3,r9
+
+L(loop):
+ lxv v0+32,0(r4)
+ vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
+ bne cr6,L(tail1)
+
+ lxv v0+32,16(r4)
+ vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
+ bne cr6,L(tail2)
+
+ lxv v0+32,32(r4)
+ vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
+ bne cr6,L(tail3)
+
+ lxv v0+32,48(r4)
+ vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
+ bne cr6,L(tail4)
+
+ addi r3,r3,64
+ addi r4,r4,64
+ b L(loop)
+
+L(tail1):
+ vctzlsbb r0,v6
+ add r3,r3,r0
+ blr
+
+L(tail2):
+ vctzlsbb r0,v6
+ add r3,r3,r0
+ addi r3,r3,16
+ blr
+
+L(tail3):
+ vctzlsbb r0,v6
+ add r3,r3,r0
+ addi r3,r3,32
+ blr
+
+L(tail4):
+ vctzlsbb r0,v6
+ add r3,r3,r0
+ addi r3,r3,48
+ blr
+
+END (STRLEN)
+libc_hidden_builtin_def (strlen)
@@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
strncase-power8
ifneq (,$(filter %le,$(config-machine)))
-sysdep_routines += strcmp-power9 strncmp-power9
+sysdep_routines += strcmp-power9 strncmp-power9 strlen-power9
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
@@ -103,6 +103,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c. */
IFUNC_IMPL (i, name, strlen,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_00,
+ __strlen_power9)
+#endif
IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07,
__strlen_power8)
IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_HAS_VSX,
new file mode 100644
@@ -0,0 +1,24 @@
+/* Optimized strlen implementation for POWER8.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define STRLEN __strlen_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power9/strlen.S>
@@ -30,13 +30,20 @@ extern __typeof (__redirect_strlen) __libc_strlen;
extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden;
extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden;
extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (__redirect_strlen) __strlen_power9 attribute_hidden;
+# endif
libc_ifunc (__libc_strlen,
- (hwcap2 & PPC_FEATURE2_ARCH_2_07)
- ? __strlen_power8 :
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strlen_power7
- : __strlen_ppc);
+# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ ? __strlen_power9 :
+# endif
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __strlen_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __strlen_power7
+ : __strlen_ppc);
#undef strlen
strong_alias (__libc_strlen, strlen)