[v3,1/2] powerpc: Add optimized strncpy for POWER9

Message ID 20200929152103.18564-1-rzinsly@linux.ibm.com
State Committed
Delegated to: Tulio Magno Quites Machado Filho
Headers
Series [v3,1/2] powerpc: Add optimized strncpy for POWER9 |

Commit Message

Raphael M Zinsly Sept. 29, 2020, 3:21 p.m. UTC
  Changes since v2:
	- Check for VSX support.
	- Call memset when a large number of bytes must be padded with zeros.

---8<---

Similar to the strcpy P9 optimization, this version uses VSX to improve
performance.
---
 sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 343 ++++++++++++++++++
 sysdeps/powerpc/powerpc64/multiarch/Makefile  |   2 +-
 .../powerpc64/multiarch/ifunc-impl-list.c     |   6 +
 .../powerpc64/multiarch/strncpy-power9.S      |  32 ++
 sysdeps/powerpc/powerpc64/multiarch/strncpy.c |   8 +
 5 files changed, 390 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
  

Comments

Raphael M Zinsly Sept. 29, 2020, 3:22 p.m. UTC | #1
Benchtest output:
                             	generic_strncpy	__strncpy_power9	__strncpy_power8	__strncpy_power7	__strncpy_ppc
Length   16, n   16, alignment  1/ 1:	7.12492	2.55148	2.55079	5.71316 
9.05306
Length   16, n   16, alignment  1/ 1:	6.71919	2.51696	2.56187	5.92056 
9.43297
Length   16, n   16, alignment  1/ 2:	6.65909	2.53904	2.55074	5.6362	9.30194
Length   16, n   16, alignment  2/ 1:	6.50693	2.51671	2.82125	5.9298	9.18833
Length    2, n    4, alignment  7/ 2:	7.68477	2.27528	5.07192	4.8353	8.4619
Length    4, n    2, alignment  2/ 7:	6.03804	1.6644	2.32355	2.8178	6.27398
Length    2, n    4, alignment  7/ 2:	7.68944	2.31009	4.25078	4.83038 
8.59554
Length    4, n    2, alignment  2/ 7:	6.04246	1.66875	2.31775	2.73826	6.8358
Length   16, n   16, alignment  2/ 2:	6.50729	2.51669	2.83075	5.91498 
9.60274
Length   16, n   16, alignment  2/ 2:	6.3441	2.51684	2.82829	5.64233	9.29031
Length   16, n   16, alignment  2/ 4:	6.33989	2.51866	2.82089	5.59129 
9.50426
Length   16, n   16, alignment  4/ 2:	7.88012	2.51145	3.44369	5.91774 
9.50185
Length    4, n    8, alignment  6/ 4:	7.88965	2.27023	5.2189	4.67992	9.06714
Length    8, n    4, alignment  4/ 6:	5.8805	1.69238	2.67589	2.95865	7.70634
Length    4, n    8, alignment  6/ 4:	7.69107	2.29724	5.2196	4.68409	9.07751
Length    8, n    4, alignment  4/ 6:	6.33989	1.69501	2.67179	2.95862 
7.75311
Length   16, n   16, alignment  3/ 3:	6.58968	2.51681	3.14295	5.92364 
8.86981
Length   16, n   16, alignment  3/ 3:	6.76256	2.51385	3.14379	5.91558 
9.02347
Length   16, n   16, alignment  3/ 6:	6.76734	2.53841	3.08182	5.90924 
8.98558
Length   16, n   16, alignment  6/ 3:	6.67014	2.51618	4.16905	5.94761 
7.81751
Length    8, n   16, alignment  5/ 6:	7.70082	2.30026	4.59182	5.41689 
10.5428
Length   16, n    8, alignment  6/ 5:	5.63868	1.87873	2.32929	4.5053	5.78866
Length    8, n   16, alignment  5/ 6:	7.40013	2.2999	4.23768	5.41724	10.1649
Length   16, n    8, alignment  6/ 5:	5.63858	1.87872	2.32768	4.58045 
6.02812
Length   16, n   16, alignment  4/ 4:	7.37003	2.5167	3.50594	5.91125	8.93866
Length   16, n   16, alignment  4/ 4:	7.51015	2.51684	3.58684	5.91127 
8.60509
Length   16, n   16, alignment  4/ 0:	7.42056	2.51149	3.38179	5.92321 
8.86607
Length   16, n   16, alignment  0/ 4:	6.6704	1.87853	2.44519	5.91475	7.68788
Length   16, n   32, alignment  4/ 0:	11.0276	3.0727	6.01877	6.9094	11.4447
Length   32, n   16, alignment  0/ 4:	6.90919	1.87852	2.45708	5.91217	6.7671
Length   16, n   32, alignment  4/ 0:	9.76588	3.07257	5.92168	6.81253 
11.8936
Length   32, n   16, alignment  0/ 4:	6.90342	1.88296	2.44527	5.91673 
7.68469
Length   16, n   16, alignment  5/ 5:	6.90186	2.51712	3.91963	5.91852 
9.46308
Length   16, n   16, alignment  5/ 5:	6.58716	2.51626	3.94884	5.91303 
9.59648
Length   16, n   16, alignment  5/ 2:	6.92421	2.52057	3.80827	5.91558	9.3486
Length   16, n   16, alignment  2/ 5:	6.50526	2.53369	2.82035	5.91729	9.065
Length   32, n   64, alignment  3/ 2:	14.0395	3.79978	6.41657	11.19	13.9713
Length   64, n   32, alignment  2/ 3:	9.85699	2.75331	3.21559	8.23056 
11.4077
Length   32, n   64, alignment  3/ 2:	14.0923	3.8037	6.38851	11.4514	15.9838
Length   64, n   32, alignment  2/ 3:	9.4437	2.75344	3.21249	8.21276	13.9496
Length   16, n   16, alignment  6/ 6:	6.33989	2.51408	4.38486	5.91681 
7.37203
Length   16, n   16, alignment  6/ 6:	6.76503	2.51645	4.26454	5.9103	7.87574
Length   16, n   16, alignment  6/ 4:	6.51654	2.51654	4.24635	5.91578 
7.17827
Length   16, n   16, alignment  4/ 6:	7.28735	2.53335	3.54029	5.92337 
8.63075
Length   64, n  128, alignment  2/ 4:	15.4973	4.98808	7.34157	11.5113 
16.7688
Length  128, n   64, alignment  4/ 2:	11.6235	3.54914	4.80814	10.3103 
11.6194
Length   64, n  128, alignment  2/ 4:	15.4979	5.02559	7.28236	11.5045 
22.1309
Length  128, n   64, alignment  4/ 2:	11.6138	3.53841	4.80527	10.3293 
19.5239
Length   16, n   16, alignment  7/ 7:	6.84212	2.51109	5.0585	5.7457	7.2307
Length   16, n   16, alignment  7/ 7:	6.86215	2.50957	5.06541	5.91726 
8.55044
Length   16, n   16, alignment  7/ 6:	6.97428	2.51876	5.05053	5.92637 
7.07715
Length   16, n   16, alignment  6/ 7:	7.01347	2.53448	4.38004	5.93278 
7.86288
Length  128, n  256, alignment  1/ 6:	17.9407	7.92071	9.38384	16.9419 
20.6065
Length  256, n  128, alignment  6/ 1:	13.3609	4.7983	7.967	12.5699	14.9996
Length  128, n  256, alignment  1/ 6:	17.9371	7.69161	9.36672	16.739	38.9048
Length  256, n  128, alignment  6/ 1:	13.3632	4.87671	7.80194	12.7028 
33.9017
Length    8, n   16, alignment  0/ 0:	7.4529	2.29963	3.62737	4.22665	7.50268
Length   32, n   16, alignment  0/ 0:	6.86674	1.87853	2.45092	2.41528 
7.30161
Length    8, n   16, alignment  7/ 2:	7.40103	2.29399	3.75703	5.43637 
8.45285
Length   32, n   16, alignment  7/ 2:	7.72683	2.35278	5.04996	5.93629 
7.18881
Length   16, n   32, alignment  0/ 0:	9.87066	3.17511	4.89448	4.41405 
10.3408
Length   64, n   32, alignment  0/ 0:	8.06217	2.32926	2.94508	2.71275 
8.11769
Length   16, n   32, alignment  6/ 4:	9.50052	3.07627	6.37858	9.46793 
10.1393
Length   64, n   32, alignment  6/ 4:	9.7197	2.75154	4.47331	7.73667	9.26558
Length   32, n   64, alignment  0/ 0:	10.9157	3.79013	4.83041	4.97713 
11.5486
Length  128, n   64, alignment  0/ 0:	9.28057	3.15788	3.5178	4.23091	11.0874
Length   32, n   64, alignment  5/ 6:	14.0472	3.8515	7.26431	10.1343	12.8115
Length  128, n   64, alignment  5/ 6:	11.5493	3.5659	5.05553	9.1005	13.4053
Length   64, n  128, alignment  0/ 0:	12.0056	4.94615	6.45436	7.06235 
14.4743
Length  256, n  128, alignment  0/ 0:	7.87506	4.49546	6.4492	5.38877	12.1437
Length   64, n  128, alignment  4/ 0:	12.4174	4.99773	7.73749	11.1452 
16.1494
Length  256, n  128, alignment  4/ 0:	12.2601	4.88446	6.95948	13.3726 
16.7583
Length  128, n  256, alignment  0/ 0:	13.9215	7.51155	7.87942	8.79876 
20.4226
Length  512, n  256, alignment  0/ 0:	10.5798	6.77319	8.79757	9.03297 
20.0197
Length  128, n  256, alignment  3/ 2:	18.0213	7.57884	9.89436	18.7839 
20.5445
Length  512, n  256, alignment  3/ 2:	16.9909	7.07957	9.9271	23.2621	25.2442
Length  256, n  512, alignment  0/ 0:	17.6825	12.3074	13.3245	13.9381 
28.7687
Length 1024, n  512, alignment  0/ 0:	16.3837	10.8306	16.6999	16.6797 
38.0562
Length  256, n  512, alignment  2/ 4:	23.1953	13.0445	14.324	26.8918	30.2049
Length 1024, n  512, alignment  2/ 4:	25.4059	12.0938	17.2483	41.4883 
47.2025
Length  512, n 1024, alignment  0/ 0:	21.029	17.1782	19.4815	21.0035	43.2361
Length 2048, n 1024, alignment  0/ 0:	28.5154	19.3221	36.9624	35.482	68.4792
Length  512, n 1024, alignment  1/ 6:	32.4103	17.9272	21.5421	46.6099	55.059
Length 2048, n 1024, alignment  1/ 6:	43.0516	21.6315	37.8787	77.7889 
83.4195
  
Lucas A. M. Magalhaes Oct. 15, 2020, 3:20 p.m. UTC | #2
Hi Raphael,

Thanks for the patch. All tests passed on a P9.

LGTM.

---
Lucas A. M. Magalhães
  
Tulio Magno Quites Machado Filho Nov. 12, 2020, 5:09 p.m. UTC | #3
Raphael Moreira Zinsly <rzinsly@linux.ibm.com> writes:

> Similar to the strcpy P9 optimization, this version uses VSX to improve
> performance.

Remember to add the Reviewed-by: lines you collected in previous versions. ;-)

> +#define FRAMESIZE (FRAME_MIN_SIZE+48)

I think you actually meant to use FRAME_MIN_SIZE+8 here.
Fixed.

> +L(zero_padding_end):
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	v18,r11,r10	/* Partial store  */
> +	blr
> +
> +	.align	4
> +L(zero_padding_memset):
> +	std	r30,-8(r1)   /* Save r30 on the stack.  */

This requires adding CFI:

	cfi_offset(r30, -8)

Done.

> +#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
> +#define STRNCPY __strncpy_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +/* memset is used to pad the end of the string.  */
> +#define MEMSET __memset_power8
> +#ifdef SHARED
> +#define MEMSET_is_local

Wrong indentation in the previous lines.  Fixed.

I wonder if we can improve this and stop depending on the list of memset
implementations in this file.
Anyway, this isn't new and is future work.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> index 7bacf28aca..3f2108ddae 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> @@ -28,11 +28,19 @@
>  extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
>  extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
>  extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
> +# endif
>  # undef strncpy
>  
>  /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
>   ifunc symbol properly. */
>  libc_ifunc_redirected (__redirect_strncpy, strncpy,
> +# ifdef __LITTLE_ENDIAN__
> +		     (hwcap2 & PPC_FEATURE2_ARCH_3_00) &&
> +		     (hwcap & PPC_FEATURE_HAS_VSX)
> +		     ? __strncpy_power9 :

Wrong indentation here.  Fixed.

Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>

Pushed as b9d83bf3eb57.

Thanks!
  

Patch

diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
new file mode 100644
index 0000000000..67cb648c65
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
@@ -0,0 +1,343 @@ 
+/* Optimized strncpy implementation for POWER9 LE.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifdef STRNCPY
+# define FUNC_NAME STRNCPY
+#else
+# define FUNC_NAME strncpy
+#endif
+
+#if !defined MEMSET
+/* Builds without IFUNC support do not predefine MEMSET; local calls should
+   then go to the internal GLIBC symbol (see libc_hidden_builtin_def).  */
+# if defined SHARED
+#  define MEMSET_is_local
+#  define MEMSET   __GI_memset
+# else
+#  define MEMSET   memset
+# endif
+#endif
+
+#define FRAMESIZE (FRAME_MIN_SIZE+48)	/* Frame around the MEMSET call.  */
+
+/* Implements the function
+
+   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   The implementation can load bytes past a null terminator, but only
+   up to the next 16-byte aligned address, so it never crosses a page.  */
+
+.machine power9
+#ifdef MEMSET_is_local
+ENTRY_TOCLESS (FUNC_NAME, 4)
+#else
+ENTRY (FUNC_NAME, 4)
+#endif
+	CALL_MCOUNT 3		/* Three arguments (r3-r5) must be preserved  */
+
+	/* Nothing to copy when n == 0  */
+	cmpdi	r5,0
+	beqlr
+
+	lbz	r0,0(r4)	/* Copy the first byte by hand  */
+	stb	r0,0(r3)
+	addi	r11,r3,1	/* r11 = write pointer, r3 stays the result  */
+	addi	r5,r5,-1
+	vspltisb v18,0		/* Zeroes in v18  */
+	cmpdi	r0,0
+	beq	L(zero_padding)
+
+	/* Done if n was 1  */
+	cmpdi	r5,0
+	beqlr
+
+	addi	r4,r4,1
+	neg	r7,r4
+	rldicl	r9,r7,0,60	/* How many bytes to get source 16B aligned?  */
+
+	/* Get source 16B aligned  */
+	lvx	v0,0,r4
+	lvsr	v1,0,r4
+	vperm	v0,v18,v0,v1
+
+	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
+	vctzlsbb r7,v6		/* Number of trailing zeroes  */
+	addi	r8,r7,1		/* Add null terminator  */
+
+	/* r8 = bytes including null
+	   r9 = bytes to get source 16B aligned
+	   if r8 > r9
+	      no null, copy r9 bytes
+	   else
+	      there is a null, copy r8 bytes and return.  */
+	cmpld	r8,r9
+	bgt	L(no_null)
+
+	cmpld	cr6,r8,r5	/* r8 <= n?  */
+	ble	cr6,L(null)
+
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	blr
+
+L(null):
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	add	r11,r11,r8
+	sub	r5,r5,r8
+	b	L(zero_padding)
+
+L(no_null):
+	cmpld	r9,r5		/* Check if length was reached.  */
+	bge	L(n_tail1)
+
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	add	r4,r4,r9
+	add	r11,r11,r9
+	sub	r5,r5,r9
+
+L(loop):
+	cmpldi	cr6,r5,64	/* Check if length was reached.  */
+	ble	cr6,L(final_loop)
+
+	lxv	32+v0,0(r4)
+	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail1)
+
+	lxv	32+v1,16(r4)
+	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail2)
+
+	lxv	32+v2,32(r4)
+	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail3)
+
+	lxv	32+v3,48(r4)
+	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail4)
+
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	stxv	32+v3,48(r11)
+
+	addi	r4,r4,64
+	addi	r11,r11,64
+	addi	r5,r5,-64
+
+	b	L(loop)
+
+L(final_loop):
+	cmpldi	cr5,r5,16
+	lxv	32+v0,0(r4)
+	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
+	ble	cr5,L(prep_n_tail1)
+	bne	cr6,L(count_tail1)
+	addi	r5,r5,-16
+
+	cmpldi	cr5,r5,16
+	lxv	32+v1,16(r4)
+	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
+	ble	cr5,L(prep_n_tail2)
+	bne	cr6,L(count_tail2)
+	addi	r5,r5,-16
+
+	cmpldi	cr5,r5,16
+	lxv	32+v2,32(r4)
+	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
+	ble	cr5,L(prep_n_tail3)
+	bne	cr6,L(count_tail3)
+	addi	r5,r5,-16
+
+	lxv	32+v3,48(r4)
+	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
+	beq	cr6,L(n_tail4)
+
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpld	r8,r5		/* r8 < n?  */
+	blt	L(tail4)
+
+L(n_tail4):
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,48	/* Offset  */
+	stxvl	32+v3,r11,r10	/* Partial store  */
+	blr
+
+L(prep_n_tail1):
+	beq	cr6,L(n_tail1)	/* Any zero bytes?  */
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpld	r8,r5		/* r8 < n?  */
+	blt	L(tail1)
+
+L(n_tail1):
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+	blr
+
+L(prep_n_tail2):
+	beq	cr6,L(n_tail2)	/* Any zero bytes?  */
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpld	r8,r5		/* r8 < n?  */
+	blt	L(tail2)
+
+L(n_tail2):
+	stxv	32+v0,0(r11)
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,16	/* Offset  */
+	stxvl	32+v1,r11,r10	/* Partial store  */
+	blr
+
+L(prep_n_tail3):
+	beq	cr6,L(n_tail3)	/* Any zero bytes?  */
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpld	r8,r5		/* r8 < n?  */
+	blt	L(tail3)
+
+L(n_tail3):
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,32	/* Offset  */
+	stxvl	32+v2,r11,r10	/* Partial store  */
+	blr
+
+L(prep_tail1):
+L(count_tail1):
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail1):
+	addi	r9,r8,1		/* Add null terminator  */
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+	b	L(zero_padding)
+
+L(prep_tail2):
+	addi	r5,r5,-16
+L(count_tail2):
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail2):
+	addi	r9,r8,1		/* Add null terminator  */
+	stxv	32+v0,0(r11)
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,16	/* Offset  */
+	stxvl	32+v1,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+	b	L(zero_padding)
+
+L(prep_tail3):
+	addi	r5,r5,-32
+L(count_tail3):
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail3):
+	addi	r9,r8,1		/* Add null terminator  */
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,32	/* Offset  */
+	stxvl	32+v2,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+	b	L(zero_padding)
+
+L(prep_tail4):
+	addi	r5,r5,-48
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail4):
+	addi	r9,r8,1		/* Add null terminator  */
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,48	/* Offset  */
+	stxvl	32+v3,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+
+/* This code pads the remainder of dest with NULL bytes.  For 255 or more
+   bytes memset performs better (threshold chosen through experimentation).
+   The zeroes live in VR v18, i.e. VSR 32+v18 for the VSX stores below.  */
+L(zero_padding):
+	cmpldi	r5,255
+	bge	L(zero_padding_memset)
+
+L(zero_padding_loop):
+	cmpldi	cr6,r5,16	/* Check if length was reached.  */
+	ble	cr6,L(zero_padding_end)
+
+	stxv	32+v18,0(r11)
+	addi	r11,r11,16
+	addi	r5,r5,-16
+
+	b	L(zero_padding_loop)
+
+L(zero_padding_end):
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v18,r11,r10	/* Partial store  */
+	blr
+
+	.align	4
+L(zero_padding_memset):
+	std	r30,-8(r1)   /* Save r30 on the stack.  */
+	cfi_offset(r30, -8)
+	mr	r30,r3       /* Save the return value of strncpy.  */
+	/* Prepare the call to memset.  */
+	mr	r3,r11       /* Pointer to the area to be zero-filled.  */
+	li	r4,0         /* Byte to be written (zero).  */
+	/* We delayed the creation of the stack frame, as well as the saving of
+	   the link register, because only at this point, we are sure that
+	   doing so is actually needed.  */
+
+	/* Save the link register.  */
+	mflr	r0
+	std	r0,16(r1)
+
+	/* Create the stack frame.  */
+	stdu	r1,-FRAMESIZE(r1)
+	cfi_adjust_cfa_offset(FRAMESIZE)
+	cfi_offset(lr, 16)
+
+	bl	MEMSET
+#ifndef MEMSET_is_local
+	nop
+#endif
+
+	ld	r0,FRAMESIZE+16(r1)
+
+	mr	r3,r30       /* Restore the return value of strncpy, i.e.:
+				dest.  */
+	ld	r30,FRAMESIZE-8(r1) /* Restore r30.  */
+	/* Restore the stack frame.  */
+	addi	r1,r1,FRAMESIZE
+	cfi_adjust_cfa_offset(-FRAMESIZE)
+	/* Restore the link register.  */
+	mtlr	r0
+	cfi_restore(lr)
+	blr
+
+END (FUNC_NAME)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 19acb6c64a..cd2b47b403 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@  sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 
 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
-		   rawmemchr-power9 strlen-power9
+		   rawmemchr-power9 strlen-power9 strncpy-power9
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index ea10b00417..fb55b07e53 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -301,6 +301,12 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
+#ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, strncpy,
+			      (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+			      && (hwcap & PPC_FEATURE_HAS_VSX),
+			      __strncpy_power9)
+#endif
 	      IFUNC_IMPL_ADD (array, i, strncpy,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strncpy_power8)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
new file mode 100644
index 0000000000..68e1e8d925
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
@@ -0,0 +1,32 @@ 
+/* Optimized strncpy implementation for POWER9 LE.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
+# define STRNCPY __strncpy_power9
+
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)
+
+/* memset is used to pad the end of the destination with zeroes.  */
+# define MEMSET __memset_power8
+# ifdef SHARED
+#  define MEMSET_is_local
+# endif
+
+# include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
index 7bacf28aca..3f2108ddae 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -28,11 +28,19 @@ 
 extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
 extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
 extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
+# endif
 # undef strncpy
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
  ifunc symbol properly. */
 libc_ifunc_redirected (__redirect_strncpy, strncpy,
+# ifdef __LITTLE_ENDIAN__
+		     (hwcap2 & PPC_FEATURE2_ARCH_3_00) &&
+		     (hwcap & PPC_FEATURE_HAS_VSX)
+		     ? __strncpy_power9 :
+# endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __strncpy_power8
 		       : (hwcap & PPC_FEATURE_HAS_VSX)