diff mbox series

powerpc64le: add optimized strlen for P9

Message ID 20200521191048.1566568-1-murphyp@linux.vnet.ibm.com
State New
Headers show
Series powerpc64le: add optimized strlen for P9 | expand

Commit Message

Paul E. Murphy May 21, 2020, 7:10 p.m. UTC
This is a followup to rawmemchr/strlen from Anton.  I missed
his original strlen patch, and likewise I wasn't happy with
the 3-4% performance drop for larger strings which occurs
around 2.5kB as the P8 vector loop is a bit faster.  As noted,
this is up to 50% faster for small strings, and about 1% faster
for larger strings (I hazard to guess this some uarch difference
between lxv and lvx).

I guess this is a semi-V2 of the patch.  Likewise, I need to
double check binutils 2.26 supports the P9 insn used here.

---8<---

This started as a trivial change to Anton's rawmemchr.  I got
carried away.  This is a hybrid between P8's asympotically
faster 64B checks with extremely efficient small string checks
e.g <64B (and sometimes a little bit more depending on alignment).

The second trick is to align to 64B by running a 48B checking loop
16B at a time until we naturally align to 64B (i.e checking 48/96/144
bytes/iteration based on the alignment after the first 5 comparisons).
This allieviates the need to check page boundaries.

Finally, explicly use the P7 strlen with the runtime loader when building
P9.  We need to be cautious about vector/vsx extensions here on P9 only
builds.
---
 .../powerpc/powerpc64/le/power9/rtld-strlen.S |   1 +
 sysdeps/powerpc/powerpc64/le/power9/strlen.S  | 215 ++++++++++++++++++
 sysdeps/powerpc/powerpc64/multiarch/Makefile  |   2 +-
 .../powerpc64/multiarch/ifunc-impl-list.c     |   4 +
 .../powerpc64/multiarch/strlen-power9.S       |   2 +
 sysdeps/powerpc/powerpc64/multiarch/strlen.c  |   5 +
 6 files changed, 228 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/powerpc/powerpc64/le/power9/rtld-strlen.S
 create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strlen.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S

Comments

Lucas A. M. Magalhaes May 21, 2020, 8:41 p.m. UTC | #1
Quoting Paul E. Murphy via Libc-alpha (2020-05-21 16:10:48)
> This is a followup to rawmemchr/strlen from Anton.  I missed
> his original strlen patch, and likewise I wasn't happy with
> the 3-4% performance drop for larger strings which occurs
> around 2.5kB as the P8 vector loop is a bit faster.  As noted,
> this is up to 50% faster for small strings, and about 1% faster
> for larger strings (I hazard to guess this some uarch difference
> between lxv and lvx).
> 
> I guess this is a semi-V2 of the patch.  Likewise, I need to
> double check binutils 2.26 supports the P9 insn used here.
> 
> ---8<---
> 
> This started as a trivial change to Anton's rawmemchr.  I got
> carried away.  This is a hybrid between P8's asympotically
> faster 64B checks with extremely efficient small string checks
> e.g <64B (and sometimes a little bit more depending on alignment).
> 
> The second trick is to align to 64B by running a 48B checking loop
> 16B at a time until we naturally align to 64B (i.e checking 48/96/144
> bytes/iteration based on the alignment after the first 5 comparisons).
> This allieviates the need to check page boundaries.
> 
> Finally, explicly use the P7 strlen with the runtime loader when building
> P9.  We need to be cautious about vector/vsx extensions here on P9 only
> builds.
> ---
>  .../powerpc/powerpc64/le/power9/rtld-strlen.S |   1 +
>  sysdeps/powerpc/powerpc64/le/power9/strlen.S  | 215 ++++++++++++++++++
>  sysdeps/powerpc/powerpc64/multiarch/Makefile  |   2 +-
>  .../powerpc64/multiarch/ifunc-impl-list.c     |   4 +
>  .../powerpc64/multiarch/strlen-power9.S       |   2 +
>  sysdeps/powerpc/powerpc64/multiarch/strlen.c  |   5 +
>  6 files changed, 228 insertions(+), 1 deletion(-)
>  create mode 100644 sysdeps/powerpc/powerpc64/le/power9/rtld-strlen.S
>  create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strlen.S
>  create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S
> 
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/rtld-strlen.S b/sysdeps/powerpc/powerpc64/le/power9/rtld-strlen.S
> new file mode 100644
> index 0000000000..e9d83323ac
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/rtld-strlen.S
> @@ -0,0 +1 @@
> +#include <sysdeps/powerpc/powerpc64/power7/strlen.S>
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strlen.S b/sysdeps/powerpc/powerpc64/le/power9/strlen.S
> new file mode 100644
> index 0000000000..084d6e31a8
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strlen.S
> @@ -0,0 +1,215 @@
> +
> +/* Optimized rawmemchr implementation for PowerPC64/POWER9.
s/rawmemchr/strlen

Still trying to understand the rest of the patch though. =)

---
Lucas A. M. Magalhães
Paul A. Clarke May 27, 2020, 4:45 p.m. UTC | #2
On Thu, May 21, 2020 at 02:10:48PM -0500, Paul E. Murphy via Libc-alpha wrote:
> This is a followup to rawmemchr/strlen from Anton.  I missed
> his original strlen patch, and likewise I wasn't happy with
> the 3-4% performance drop for larger strings which occurs
> around 2.5kB as the P8 vector loop is a bit faster.  As noted,
> this is up to 50% faster for small strings, and about 1% faster
> for larger strings (I hazard to guess this some uarch difference
> between lxv and lvx).
> 
> I guess this is a semi-V2 of the patch.  Likewise, I need to
> double check binutils 2.26 supports the P9 insn used here.
> 
> ---8<---
> 
> This started as a trivial change to Anton's rawmemchr.  I got
> carried away.  This is a hybrid between P8's asympotically
> faster 64B checks with extremely efficient small string checks
> e.g <64B (and sometimes a little bit more depending on alignment).
> 
> The second trick is to align to 64B by running a 48B checking loop
> 16B at a time until we naturally align to 64B (i.e checking 48/96/144
> bytes/iteration based on the alignment after the first 5 comparisons).
> This allieviates the need to check page boundaries.
> 
> Finally, explicly use the P7 strlen with the runtime loader when building
> P9.  We need to be cautious about vector/vsx extensions here on P9 only
> builds.
> ---
>  .../powerpc/powerpc64/le/power9/rtld-strlen.S |   1 +
>  sysdeps/powerpc/powerpc64/le/power9/strlen.S  | 215 ++++++++++++++++++
>  sysdeps/powerpc/powerpc64/multiarch/Makefile  |   2 +-
>  .../powerpc64/multiarch/ifunc-impl-list.c     |   4 +
>  .../powerpc64/multiarch/strlen-power9.S       |   2 +
>  sysdeps/powerpc/powerpc64/multiarch/strlen.c  |   5 +
>  6 files changed, 228 insertions(+), 1 deletion(-)
>  create mode 100644 sysdeps/powerpc/powerpc64/le/power9/rtld-strlen.S
>  create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strlen.S
>  create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S
> 
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/rtld-strlen.S b/sysdeps/powerpc/powerpc64/le/power9/rtld-strlen.S
> new file mode 100644
> index 0000000000..e9d83323ac
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/rtld-strlen.S
> @@ -0,0 +1 @@
> +#include <sysdeps/powerpc/powerpc64/power7/strlen.S>
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strlen.S b/sysdeps/powerpc/powerpc64/le/power9/strlen.S
> new file mode 100644
> index 0000000000..084d6e31a8
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strlen.S
> @@ -0,0 +1,215 @@
> +
> +/* Optimized rawmemchr implementation for PowerPC64/POWER9.
> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +#ifndef STRLEN
> +# define STRLEN __strlen
> +# define DEFINE_STRLEN_HIDDEN_DEF 1
> +#endif
> +
> +/* Implements the function
> +
> +   int [r3] strlen (void *s [r3])

const void *s?

> +
> +   The implementation can load bytes past a matching byte, but only
> +   up to the next 16B or 64B boundary, so it never crosses a page.  */
> +
> +.machine power9
> +ENTRY_TOCLESS (STRLEN, 4)
> +	CALL_MCOUNT 2
> +
> +	mr r4, r3

This can be moved later, and folded into the "add" below.
In my experiments, it helped performance for tiny strings.

extra space after comma.

> +	vspltisb v18, 0
> +	vspltisb v19, -1

extra spaces after commas.

> +
> +	neg	r5,r3
> +	rldicl	r9,r5,0,60	/* How many bytes to get source 16B aligned?  */
> +
> +
> +	/* Align data and fill bytes not loaded with non matching char  */

Missing '.' after 'char', but I suggest some different comments (subjective)...

Consider:
/* Load cache line containing beginning of string.  */

> +	lvx	v0,0,r4

Consider:
/* Create permute vector to shift into alignment.  */

> +	lvsr	v1,0,r4

To move the "mr" above later, both of the above instructions would thus
need to use "r3" instead of "r4".

Consider:
/* Shift into alignment, filling with 0xff.  */

> +	vperm	v0,v19,v0,v1
> +
> +	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */

need a '.' after 'otherwise'.

> +	beq	cr6,L(aligned)
> +

Consider:
/* String ends within first cache line.  Compute length.  */

> +	vctzlsbb r3,v6
> +	blr
> +
> +	/* Test 64B 16B at a time.  The vector loop is costly for small strings. */

Consider:
/* Test 64B 16B at a time.  Postpone the vector loop ("loop", below), which
   is costly for small strings.  */

> +L(aligned):
> +	add	r4,r4,r9

And this can change to "add r4,r3,r9".

> +
> +	rldicl. r5, r4, 60, 62	/* Determine how many 48B loops we should run */

/* Determine how many 48B loops we should run in "loop" below.
   48B loops perform better than simpler 16B loops.  */

extra spaces after commas

Should this calculation be moved down, just before its use at "beq", or does
it schedule better if left here?  Since the result is not used until after
the next 14 instructions, strings of these lengths are penalized.

> +
> +	lxv	v0+32,0(r4)

Is the "+32" needed to accommodate a binutils that doesn't support VSX registers?

> +	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */

need a '.' after 'otherwise'.

> +	bne	cr6,L(tail1)
> +
> +	lxv	v0+32,16(r4)
> +	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */

need a '.' after 'otherwise'.

> +	bne	cr6,L(tail2)
> +
> +	lxv	v0+32,32(r4)
> +	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */

need a '.' after 'otherwise'.

> +	bne	cr6,L(tail3)
> +
> +	lxv	v0+32,48(r4)
> +	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */

need a '.' after 'otherwise'.

> +	bne	cr6,L(tail4)
> +	addi 	r4, r4, 64

extra spaces after commas.

> +
> +	/* prep for weird constant generation of reduction */

Need leading capitalization ("Prep...").  But, maybe a better comment instead...
/* Load a dummy aligned address (0) so that 'lvsl' produces a shift vector
 * of 0..15.  */

> +	li r0, 0

Extra space after ','
Would it be bad to move this just a little closer to where it is used much
later?

> +
> +	/* Skip the alignment if not needed */

need a '.' after 'needed'.

(Above "rldicl." could be moved as late as here.)

> +	beq L(loop_64b)
> +	mtctr r5
> +
> +	/* Test 48B per iteration until 64B aligned  */

need a '.' after 'aligned'.

> +	.p2align 5
> +L(loop):
> +	lxv	v0+32,0(r4)
> +	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */

need a '.' after 'otherwise'.

> +	bne	cr6,L(tail1)
> +
> +	lxv	v0+32,16(r4)
> +	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */

here, too.

> +	bne	cr6,L(tail2)
> +
> +	lxv	v0+32,32(r4)
> +	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */

and here.

> +	bne	cr6,L(tail3)
> +
> +	addi	r4,r4,48
> +	bdnz	L(loop)
> +
> +	.p2align 5
> +L(loop_64b):
> +	lxv	  v1+32, 0(r4)  /* Load 4 quadwords.  */
> +	lxv	  v2+32, 16(r4)
> +	lxv	  v3+32, 32(r4)
> +	lxv	  v4+32, 48(r4)
> +	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
> +	vminub	  v6,v3,v4
> +	vminub	  v7,v5,v6
> +	vcmpequb. v7,v7,v18  /* Check for NULLs.  */
> +	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
> +	bne	  cr6,L(vmx_zero)
> +
> +	lxv	  v1+32, 0(r4)  /* Load 4 quadwords.  */
> +	lxv	  v2+32, 16(r4)
> +	lxv	  v3+32, 32(r4)
> +	lxv	  v4+32, 48(r4)
> +	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
> +	vminub	  v6,v3,v4
> +	vminub	  v7,v5,v6
> +	vcmpequb. v7,v7,v18  /* Check for NULLs.  */
> +	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
> +	bne	  cr6,L(vmx_zero)
> +
> +	lxv	  v1+32, 0(r4)  /* Load 4 quadwords.  */
> +	lxv	  v2+32, 16(r4)
> +	lxv	  v3+32, 32(r4)
> +	lxv	  v4+32, 48(r4)
> +	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
> +	vminub	  v6,v3,v4
> +	vminub	  v7,v5,v6
> +	vcmpequb. v7,v7,v18  /* Check for NULLs.  */
> +	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
> +	beq	  cr6,L(loop_64b)

Curious how much this loop unrolling helps, since it adds a fair bit of
redundant code?

> +
> +L(vmx_zero):
> +	/* OK, we found a null byte.  Let's look for it in the current 64-byte
> +	   block and mark it in its corresponding VR.  */
> +	vcmpequb  v1,v1,v18
> +	vcmpequb  v2,v2,v18
> +	vcmpequb  v3,v3,v18
> +	vcmpequb  v4,v4,v18
> +
> +	/* We will now 'compress' the result into a single doubleword, so it
> +	   can be moved to a GPR for the final calculation.  First, we
> +	   generate an appropriate mask for vbpermq, so we can permute bits into
> +	   the first halfword.  */

I'm wondering (without having verified) if you can do something here akin to
what's done in the "tail" sections below, using "vctzlsbb".

> +	vspltisb  v10,3
> +	lvsl	  v11,r0,r0

Second field should probably be "0" instead of "r0" ("v11,0,r0").

> +	vslb	  v10,v11,v10
> +
> +	/* Permute the first bit of each byte into bits 48-63.  */
> +	vbpermq	v1,v1,v10
> +	vbpermq	v2,v2,v10
> +	vbpermq	v3,v3,v10
> +	vbpermq	v4,v4,v10
> +
> +	/* Shift each component into its correct position for merging.  */
> +	vsldoi  v2,v2,v2,2
> +	vsldoi  v3,v3,v3,4
> +	vsldoi  v4,v4,v4,6
> +
> +	/* Merge the results and move to a GPR.  */
> +	vor	v1,v2,v1
> +	vor	v2,v3,v4
> +	vor	v4,v1,v2
> +	mfvrd	r10,v4
> +
> +	 /* Adjust address to the begninning of the current 64-byte block.  */
> +	addi	r4,r4,-64
> +
> +	addi	r9, r10,-1    /* Form a mask from trailing zeros.  */
> +	andc	r9, r9,r10
> +	popcntd	r0, r9	      /* Count the bits in the mask.  */

extra spaces after the first comma in the above 3 lines.

> +	subf	r5,r3,r4
> +	add     r3,r5,r0      /* Compute final length.  */
> +	blr
> +
> +L(tail1):
> +	vctzlsbb r0,v6
> +	add	r4,r4,r0
> +	subf	r3,r3,r4
> +	blr
> +
> +L(tail2):
> +	vctzlsbb r0,v6
> +	add	r4,r4,r0
> +	addi	r4,r4,16
> +	subf	r3,r3,r4
> +	blr
> +
> +L(tail3):
> +	vctzlsbb r0,v6
> +	add	r4,r4,r0
> +	addi	r4,r4,32
> +	subf	r3,r3,r4
> +	blr
> +
> +L(tail4):
> +	vctzlsbb r0,v6
> +	add	r4,r4,r0
> +	addi	r4,r4,48
> +	subf	r3,r3,r4
> +	blr
> +
> +END (STRLEN)
(snip)

PC
Paul E Murphy May 29, 2020, 4:26 p.m. UTC | #3
V3 is attached with changes to formatting and a couple of 
simplifications as noted below.

On 5/27/20 11:45 AM, Paul A. Clarke wrote:
> On Thu, May 21, 2020 at 02:10:48PM -0500, Paul E. Murphy via Libc-alpha wrote:

>> +/* Implements the function
>> +
>> +   int [r3] strlen (void *s [r3])
> 
> const void *s?

Fixed, alongside folding away the mr r3,r4.  Likewise, the basic GNU 
formatting requests, and removed some of the more redundant ones.  Thank 
you for the suggested changes.

>> +	.p2align 5
>> +L(loop_64b):
>> +	lxv	  v1+32, 0(r4)  /* Load 4 quadwords.  */
>> +	lxv	  v2+32, 16(r4)
>> +	lxv	  v3+32, 32(r4)
>> +	lxv	  v4+32, 48(r4)
>> +	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
>> +	vminub	  v6,v3,v4
>> +	vminub	  v7,v5,v6
>> +	vcmpequb. v7,v7,v18  /* Check for NULLs.  */
>> +	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
>> +	bne	  cr6,L(vmx_zero)
>> +
>> +	lxv	  v1+32, 0(r4)  /* Load 4 quadwords.  */
>> +	lxv	  v2+32, 16(r4)
>> +	lxv	  v3+32, 32(r4)
>> +	lxv	  v4+32, 48(r4)
>> +	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
>> +	vminub	  v6,v3,v4
>> +	vminub	  v7,v5,v6
>> +	vcmpequb. v7,v7,v18  /* Check for NULLs.  */
>> +	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
>> +	bne	  cr6,L(vmx_zero)
>> +
>> +	lxv	  v1+32, 0(r4)  /* Load 4 quadwords.  */
>> +	lxv	  v2+32, 16(r4)
>> +	lxv	  v3+32, 32(r4)
>> +	lxv	  v4+32, 48(r4)
>> +	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
>> +	vminub	  v6,v3,v4
>> +	vminub	  v7,v5,v6
>> +	vcmpequb. v7,v7,v18  /* Check for NULLs.  */
>> +	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
>> +	beq	  cr6,L(loop_64b)
> 
> Curious how much this loop unrolling helps, since it adds a fair bit of
> redundant code?

It does seem to help a little bit, though maybe just an artifact of the 
benchsuite.

> 
>> +
>> +L(vmx_zero):
>> +	/* OK, we found a null byte.  Let's look for it in the current 64-byte
>> +	   block and mark it in its corresponding VR.  */
>> +	vcmpequb  v1,v1,v18
>> +	vcmpequb  v2,v2,v18
>> +	vcmpequb  v3,v3,v18
>> +	vcmpequb  v4,v4,v18
>> +
>> +	/* We will now 'compress' the result into a single doubleword, so it
>> +	   can be moved to a GPR for the final calculation.  First, we
>> +	   generate an appropriate mask for vbpermq, so we can permute bits into
>> +	   the first halfword.  */
> 
> I'm wondering (without having verified) if you can do something here akin to
> what's done in the "tail" sections below, using "vctzlsbb".

It does not help when the content spans more than 1 VR.  I don't think 
there is much to improve for a 64b mask reduction.  Though, we can save 
a couple cycles below using cnttzd (new in ISA 3.0).
diff mbox series

Patch

diff --git a/sysdeps/powerpc/powerpc64/le/power9/rtld-strlen.S b/sysdeps/powerpc/powerpc64/le/power9/rtld-strlen.S
new file mode 100644
index 0000000000..e9d83323ac
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/rtld-strlen.S
@@ -0,0 +1 @@ 
+#include <sysdeps/powerpc/powerpc64/power7/strlen.S>
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strlen.S b/sysdeps/powerpc/powerpc64/le/power9/strlen.S
new file mode 100644
index 0000000000..084d6e31a8
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/strlen.S
@@ -0,0 +1,215 @@ 
+
+/* Optimized rawmemchr implementation for PowerPC64/POWER9.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifndef STRLEN
+# define STRLEN __strlen
+# define DEFINE_STRLEN_HIDDEN_DEF 1
+#endif
+
+/* Implements the function
+
+   int [r3] strlen (void *s [r3])
+
+   The implementation can load bytes past a matching byte, but only
+   up to the next 16B or 64B boundary, so it never crosses a page.  */
+
+.machine power9
+ENTRY_TOCLESS (STRLEN, 4)
+	CALL_MCOUNT 2
+
+	mr r4, r3
+	vspltisb v18, 0
+	vspltisb v19, -1
+
+	neg	r5,r3
+	rldicl	r9,r5,0,60	/* How many bytes to get source 16B aligned?  */
+
+
+	/* Align data and fill bytes not loaded with non matching char  */
+	lvx	v0,0,r4
+	lvsr	v1,0,r4
+	vperm	v0,v19,v0,v1
+
+	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */
+	beq	cr6,L(aligned)
+
+	vctzlsbb r3,v6
+	blr
+
+	/* Test 64B 16B at a time.  The vector loop is costly for small strings. */
+L(aligned):
+	add	r4,r4,r9
+
+	rldicl. r5, r4, 60, 62	/* Determine how many 48B loops we should run */
+
+	lxv	v0+32,0(r4)
+	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */
+	bne	cr6,L(tail1)
+
+	lxv	v0+32,16(r4)
+	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */
+	bne	cr6,L(tail2)
+
+	lxv	v0+32,32(r4)
+	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */
+	bne	cr6,L(tail3)
+
+	lxv	v0+32,48(r4)
+	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */
+	bne	cr6,L(tail4)
+	addi 	r4, r4, 64
+
+	/* prep for weird constant generation of reduction */
+	li r0, 0
+
+	/* Skip the alignment if not needed */
+	beq L(loop_64b)
+	mtctr r5
+
+	/* Test 48B per iteration until 64B aligned  */
+	.p2align 5
+L(loop):
+	lxv	v0+32,0(r4)
+	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */
+	bne	cr6,L(tail1)
+
+	lxv	v0+32,16(r4)
+	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */
+	bne	cr6,L(tail2)
+
+	lxv	v0+32,32(r4)
+	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */
+	bne	cr6,L(tail3)
+
+	addi	r4,r4,48
+	bdnz	L(loop)
+
+	.p2align 5
+L(loop_64b):
+	lxv	  v1+32, 0(r4)  /* Load 4 quadwords.  */
+	lxv	  v2+32, 16(r4)
+	lxv	  v3+32, 32(r4)
+	lxv	  v4+32, 48(r4)
+	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
+	vminub	  v6,v3,v4
+	vminub	  v7,v5,v6
+	vcmpequb. v7,v7,v18  /* Check for NULLs.  */
+	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
+	bne	  cr6,L(vmx_zero)
+
+	lxv	  v1+32, 0(r4)  /* Load 4 quadwords.  */
+	lxv	  v2+32, 16(r4)
+	lxv	  v3+32, 32(r4)
+	lxv	  v4+32, 48(r4)
+	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
+	vminub	  v6,v3,v4
+	vminub	  v7,v5,v6
+	vcmpequb. v7,v7,v18  /* Check for NULLs.  */
+	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
+	bne	  cr6,L(vmx_zero)
+
+	lxv	  v1+32, 0(r4)  /* Load 4 quadwords.  */
+	lxv	  v2+32, 16(r4)
+	lxv	  v3+32, 32(r4)
+	lxv	  v4+32, 48(r4)
+	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
+	vminub	  v6,v3,v4
+	vminub	  v7,v5,v6
+	vcmpequb. v7,v7,v18  /* Check for NULLs.  */
+	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
+	beq	  cr6,L(loop_64b)
+
+L(vmx_zero):
+	/* OK, we found a null byte.  Let's look for it in the current 64-byte
+	   block and mark it in its corresponding VR.  */
+	vcmpequb  v1,v1,v18
+	vcmpequb  v2,v2,v18
+	vcmpequb  v3,v3,v18
+	vcmpequb  v4,v4,v18
+
+	/* We will now 'compress' the result into a single doubleword, so it
+	   can be moved to a GPR for the final calculation.  First, we
+	   generate an appropriate mask for vbpermq, so we can permute bits into
+	   the first halfword.  */
+	vspltisb  v10,3
+	lvsl	  v11,r0,r0
+	vslb	  v10,v11,v10
+
+	/* Permute the first bit of each byte into bits 48-63.  */
+	vbpermq	v1,v1,v10
+	vbpermq	v2,v2,v10
+	vbpermq	v3,v3,v10
+	vbpermq	v4,v4,v10
+
+	/* Shift each component into its correct position for merging.  */
+	vsldoi  v2,v2,v2,2
+	vsldoi  v3,v3,v3,4
+	vsldoi  v4,v4,v4,6
+
+	/* Merge the results and move to a GPR.  */
+	vor	v1,v2,v1
+	vor	v2,v3,v4
+	vor	v4,v1,v2
+	mfvrd	r10,v4
+
+	 /* Adjust address to the begninning of the current 64-byte block.  */
+	addi	r4,r4,-64
+
+	addi	r9, r10,-1    /* Form a mask from trailing zeros.  */
+	andc	r9, r9,r10
+	popcntd	r0, r9	      /* Count the bits in the mask.  */
+	subf	r5,r3,r4
+	add     r3,r5,r0      /* Compute final length.  */
+	blr
+
+L(tail1):
+	vctzlsbb r0,v6
+	add	r4,r4,r0
+	subf	r3,r3,r4
+	blr
+
+L(tail2):
+	vctzlsbb r0,v6
+	add	r4,r4,r0
+	addi	r4,r4,16
+	subf	r3,r3,r4
+	blr
+
+L(tail3):
+	vctzlsbb r0,v6
+	add	r4,r4,r0
+	addi	r4,r4,32
+	subf	r3,r3,r4
+	blr
+
+L(tail4):
+	vctzlsbb r0,v6
+	add	r4,r4,r0
+	addi	r4,r4,48
+	subf	r3,r3,r4
+	blr
+
+END (STRLEN)
+
+#ifdef DEFINE_STRLEN_HIDDEN_DEF
+weak_alias (__strlen, strlen)
+libc_hidden_builtin_def (strlen)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index fc2268f6b5..19acb6c64a 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@  sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 
 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
-		   rawmemchr-power9
+		   rawmemchr-power9 strlen-power9
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 59a227ee22..ea10b00417 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -111,6 +111,10 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c.  */
   IFUNC_IMPL (i, name, strlen,
+#ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __strlen_power9)
+#endif
 	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strlen_power8)
 	      IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_HAS_VSX,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S
new file mode 100644
index 0000000000..68c8d54b5f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S
@@ -0,0 +1,2 @@ 
+#define STRLEN __strlen_power9
+#include <sysdeps/powerpc/powerpc64/le/power9/strlen.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
index e587554221..cd9dc78a7c 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
@@ -30,8 +30,13 @@  extern __typeof (__redirect_strlen) __libc_strlen;
 extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden;
+extern __typeof (__redirect_strlen) __strlen_power9 attribute_hidden;
 
 libc_ifunc (__libc_strlen,
+# ifdef __LITTLE_ENDIAN__
+	  (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+	  ? __strlen_power9 :
+# endif
 	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 	    ? __strlen_power8 :
 	      (hwcap & PPC_FEATURE_HAS_VSX)