powerpc: Optimized strlen for POWER9

Message ID 20200501133213.68bfbec7@kryten.localdomain
State Superseded
Headers
Series powerpc: Optimized strlen for POWER9 |

Commit Message

Anton Blanchard May 1, 2020, 3:32 a.m. UTC
  This version performs much better than the POWER8 version for medium
length strings (50-100 bytes) as well as smaller gains on small
and long unaligned strings.
---
 sysdeps/powerpc/powerpc64/le/power9/strlen.S  | 102 ++++++++++++++++++
 sysdeps/powerpc/powerpc64/multiarch/Makefile  |   2 +-
 .../powerpc64/multiarch/ifunc-impl-list.c     |   4 +
 .../powerpc64/multiarch/strlen-power9.S       |  24 +++++
 sysdeps/powerpc/powerpc64/multiarch/strlen.c  |  17 ++-
 5 files changed, 143 insertions(+), 6 deletions(-)
 create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strlen.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S
  

Comments

Adhemerval Zanella May 5, 2020, 9:10 p.m. UTC | #1
On 01/05/2020 00:32, Anton Blanchard via Libc-alpha wrote:
> This version performs much better than the POWER8 version for medium
> length strings (50-100 bytes) as well as smaller gains on small
> and long unaligned strings.

As for strcmp, it seems that it uses the ISA 3.0 partial stores to 
optimize vector instructions usage. Could you add it on the commit 
message?

Usually for such optimizations we try to get a baseline benchmark results
using glibc benchtests.  Could you post the results for before and after?


LGTM with some clarification below.

Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>

> ---
>  sysdeps/powerpc/powerpc64/le/power9/strlen.S  | 102 ++++++++++++++++++
>  sysdeps/powerpc/powerpc64/multiarch/Makefile  |   2 +-
>  .../powerpc64/multiarch/ifunc-impl-list.c     |   4 +
>  .../powerpc64/multiarch/strlen-power9.S       |  24 +++++
>  sysdeps/powerpc/powerpc64/multiarch/strlen.c  |  17 ++-
>  5 files changed, 143 insertions(+), 6 deletions(-)
>  create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strlen.S
>  create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S
> 
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strlen.S b/sysdeps/powerpc/powerpc64/le/power9/strlen.S
> new file mode 100644
> index 0000000000..afaa76907f
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strlen.S
> @@ -0,0 +1,102 @@
> +/* Optimized strlen implementation for PowerPC64/POWER9.
> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +#ifndef STRLEN
> +# define STRLEN strlen
> +#endif
> +
> +/* Implements the function
> +
> +   int [r3] strlen (char *s [r3])
> +
> +   The implementation can load bytes past a null terminator, but only
> +   up to the next 16B boundary, so it never crosses a page.  */
> +
> +.machine power9

I assume that minimal supported binutils won't barf on this, correct?

> +ENTRY_TOCLESS (STRLEN, 4)
> +	CALL_MCOUNT 1
> +
> +	vspltisb v19,-1		/* Ones in v19  */
> +	vspltisb v18,0		/* Zeroes in v18  */
> +
> +	neg	r5,r3
> +	rldicl	r9,r5,0,60	/* How many bytes to get source 16B aligned?  */
> +
> +	/* Align data and fill bytes not loaded with ones  */
> +	lvx	v0,0,r3
> +	lvsr	v1,0,r3
> +	vperm	v0,v19,v0,v1
> +
> +	vcmpequb. v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
> +	beq	cr6,L(aligned)
> +
> +	vctzlsbb r3,v6
> +	blr
> +
> +L(aligned):
> +	add	r4,r3,r9
> +	mr	r3,r9
> +
> +L(loop):

Should we enforce alignment here?

> +	lxv	v0+32,0(r4)
> +	vcmpequb. v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
> +	bne	cr6,L(tail1)
> +
> +	lxv	v0+32,16(r4)
> +	vcmpequb. v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
> +	bne	cr6,L(tail2)
> +
> +	lxv	v0+32,32(r4)
> +	vcmpequb. v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
> +	bne	cr6,L(tail3)
> +
> +	lxv	v0+32,48(r4)
> +	vcmpequb. v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
> +	bne	cr6,L(tail4)

As for strcmp, why unroll 4x time here?

> +
> +	addi	r3,r3,64
> +	addi	r4,r4,64
> +	b	L(loop)
> +
> +L(tail1):
> +	vctzlsbb r0,v6
> +	add	r3,r3,r0
> +	blr
> +
> +L(tail2):
> +	vctzlsbb r0,v6
> +	add	r3,r3,r0
> +	addi	r3,r3,16
> +	blr
> +
> +L(tail3):
> +	vctzlsbb r0,v6
> +	add	r3,r3,r0
> +	addi	r3,r3,32
> +	blr
> +
> +L(tail4):
> +	vctzlsbb r0,v6
> +	add	r3,r3,r0
> +	addi	r3,r3,48
> +	blr
> +
> +END (STRLEN)
> +libc_hidden_builtin_def (strlen)

Ok.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index ea936bf9ed..1f9318bfbb 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> @@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
>  		   strncase-power8
>  
>  ifneq (,$(filter %le,$(config-machine)))
> -sysdep_routines += strcmp-power9 strncmp-power9
> +sysdep_routines += strcmp-power9 strncmp-power9 strlen-power9
>  endif
>  CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
>  CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops

Ok.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index b9fef3f43c..1fe7fb7812 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -103,6 +103,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  
>    /* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c.  */
>    IFUNC_IMPL (i, name, strlen,
> +#ifdef __LITTLE_ENDIAN__
> +	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_00,
> +			      __strlen_power9)
> +#endif
>  	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07,
>  			      __strlen_power8)
>  	      IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_HAS_VSX,

Ok.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S
> new file mode 100644
> index 0000000000..223ff54c39
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S
> @@ -0,0 +1,24 @@
> +/* Optimized strlen implementation for POWER8.
> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define STRLEN __strlen_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +#include <sysdeps/powerpc/powerpc64/le/power9/strlen.S>

Ok.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
> index e587554221..c418c2bde4 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
> @@ -30,13 +30,20 @@ extern __typeof (__redirect_strlen) __libc_strlen;
>  extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden;
>  extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden;
>  extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (__redirect_strlen) __strlen_power9 attribute_hidden;
> +# endif
>  
>  libc_ifunc (__libc_strlen,
> -	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
> -	    ? __strlen_power8 :
> -	      (hwcap & PPC_FEATURE_HAS_VSX)
> -	      ? __strlen_power7
> -	      : __strlen_ppc);
> +# ifdef __LITTLE_ENDIAN__
> +	    (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> +	    ? __strlen_power9 :
> +# endif
> +	      (hwcap2 & PPC_FEATURE2_ARCH_2_07)
> +	      ? __strlen_power8 :
> +	        (hwcap & PPC_FEATURE_HAS_VSX)
> +	        ? __strlen_power7
> +	        : __strlen_ppc);
>  
>  #undef strlen
>  strong_alias (__libc_strlen, strlen)
> 

Ok.
  

Patch

diff --git a/sysdeps/powerpc/powerpc64/le/power9/strlen.S b/sysdeps/powerpc/powerpc64/le/power9/strlen.S
new file mode 100644
index 0000000000..afaa76907f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/strlen.S
@@ -0,0 +1,102 @@ 
+/* Optimized strlen implementation for PowerPC64/POWER9.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifndef STRLEN
+# define STRLEN strlen
+#endif
+
+/* Implements the function
+
+   int [r3] strlen (char *s [r3])
+
+   The implementation can load bytes past a null terminator, but only
+   up to the next 16B boundary, so it never crosses a page.  */
+
+.machine power9
+ENTRY_TOCLESS (STRLEN, 4)
+	CALL_MCOUNT 1
+
+	vspltisb v19,-1		/* Ones in v19  */
+	vspltisb v18,0		/* Zeroes in v18  */
+
+	neg	r5,r3
+	rldicl	r9,r5,0,60	/* How many bytes to get source 16B aligned?  */
+
+	/* Align data and fill bytes not loaded with ones  */
+	lvx	v0,0,r3
+	lvsr	v1,0,r3
+	vperm	v0,v19,v0,v1
+
+	vcmpequb. v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
+	beq	cr6,L(aligned)
+
+	vctzlsbb r3,v6
+	blr
+
+L(aligned):
+	add	r4,r3,r9
+	mr	r3,r9
+
+L(loop):
+	lxv	v0+32,0(r4)
+	vcmpequb. v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
+	bne	cr6,L(tail1)
+
+	lxv	v0+32,16(r4)
+	vcmpequb. v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
+	bne	cr6,L(tail2)
+
+	lxv	v0+32,32(r4)
+	vcmpequb. v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
+	bne	cr6,L(tail3)
+
+	lxv	v0+32,48(r4)
+	vcmpequb. v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
+	bne	cr6,L(tail4)
+
+	addi	r3,r3,64
+	addi	r4,r4,64
+	b	L(loop)
+
+L(tail1):
+	vctzlsbb r0,v6
+	add	r3,r3,r0
+	blr
+
+L(tail2):
+	vctzlsbb r0,v6
+	add	r3,r3,r0
+	addi	r3,r3,16
+	blr
+
+L(tail3):
+	vctzlsbb r0,v6
+	add	r3,r3,r0
+	addi	r3,r3,32
+	blr
+
+L(tail4):
+	vctzlsbb r0,v6
+	add	r3,r3,r0
+	addi	r3,r3,48
+	blr
+
+END (STRLEN)
+libc_hidden_builtin_def (strlen)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index ea936bf9ed..1f9318bfbb 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -32,7 +32,7 @@  sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 		   strncase-power8
 
 ifneq (,$(filter %le,$(config-machine)))
-sysdep_routines += strcmp-power9 strncmp-power9
+sysdep_routines += strcmp-power9 strncmp-power9 strlen-power9
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index b9fef3f43c..1fe7fb7812 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -103,6 +103,10 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c.  */
   IFUNC_IMPL (i, name, strlen,
+#ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __strlen_power9)
+#endif
 	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strlen_power8)
 	      IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_HAS_VSX,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S
new file mode 100644
index 0000000000..223ff54c39
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S
@@ -0,0 +1,24 @@ 
+/* Optimized strlen implementation for POWER8.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define STRLEN __strlen_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power9/strlen.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
index e587554221..c418c2bde4 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
@@ -30,13 +30,20 @@  extern __typeof (__redirect_strlen) __libc_strlen;
 extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (__redirect_strlen) __strlen_power9 attribute_hidden;
+# endif
 
 libc_ifunc (__libc_strlen,
-	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-	    ? __strlen_power8 :
-	      (hwcap & PPC_FEATURE_HAS_VSX)
-	      ? __strlen_power7
-	      : __strlen_ppc);
+# ifdef __LITTLE_ENDIAN__
+	    (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+	    ? __strlen_power9 :
+# endif
+	      (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+	      ? __strlen_power8 :
+	        (hwcap & PPC_FEATURE_HAS_VSX)
+	        ? __strlen_power7
+	        : __strlen_ppc);
 
 #undef strlen
 strong_alias (__libc_strlen, strlen)