diff mbox series

[v3,2/2] powerpc: Add optimized stpncpy for POWER9

Message ID 20200929152103.18564-2-rzinsly@linux.ibm.com
State Committed
Delegated to: Tulio Magno Quites Machado Filho
Headers show
Series [v3,1/2] powerpc: Add optimized strncpy for POWER9 | expand

Commit Message

Raphael M Zinsly Sept. 29, 2020, 3:21 p.m. UTC
Add stpncpy support into the POWER9 strncpy.
---
 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 +++++++
 sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 68 ++++++++++++++++++-
 sysdeps/powerpc/powerpc64/multiarch/Makefile  |  2 +-
 .../powerpc64/multiarch/ifunc-impl-list.c     |  6 ++
 .../powerpc64/multiarch/stpncpy-power9.S      | 29 ++++++++
 sysdeps/powerpc/powerpc64/multiarch/stpncpy.c |  8 +++
 6 files changed, 135 insertions(+), 2 deletions(-)
 create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S

Comments

Raphael M Zinsly Sept. 29, 2020, 3:23 p.m. UTC | #1
generic_stpncpy	__stpncpy_power9 
__stpncpy_power8	__stpncpy_power7	__stpncpy_ppc
Length   16, n   16, alignment  1/ 1:	7.04141	2.66905	2.71071	5.33257 
9.45193
Length   16, n   16, alignment  1/ 1:	7.01728	2.54349	2.70763	5.35555 
9.40601
Length   16, n   16, alignment  1/ 2:	6.76331	2.56894	2.70649	5.28715 
9.19534
Length   16, n   16, alignment  2/ 1:	6.41285	2.52953	2.86392	5.25868 
9.24343
Length    2, n    4, alignment  7/ 2:	7.76627	2.36037	4.34749	4.05757 
8.45648
Length    4, n    2, alignment  2/ 7:	6.15257	1.734	2.66932	2.81884	6.61486
Length    2, n    4, alignment  7/ 2:	7.69004	2.34779	3.90224	4.08693 
8.51617
Length    4, n    2, alignment  2/ 7:	6.14888	1.73738	2.66929	2.81777 
6.39066
Length   16, n   16, alignment  2/ 2:	7.25765	2.5434	2.8759	4.7084	9.43171
Length   16, n   16, alignment  2/ 2:	6.41274	2.52681	2.87939	5.2894	9.2505
Length   16, n   16, alignment  2/ 4:	6.74797	2.6683	2.82869	5.27608	9.43391
Length   16, n   16, alignment  4/ 2:	7.6281	2.54368	3.52982	5.26862	8.7369
Length    4, n    8, alignment  6/ 4:	7.79233	2.33099	5.64785	4.21131	9.03
Length    8, n    4, alignment  4/ 6:	6.01824	1.73782	2.81779	2.81777 
7.90004
Length    4, n    8, alignment  6/ 4:	7.94851	2.33098	4.90456	3.75698 
8.89379
Length    8, n    4, alignment  4/ 6:	6.0183	1.73715	2.81777	2.41521	7.83867
Length   16, n   16, alignment  3/ 3:	6.93178	2.66854	3.22004	5.31673 
9.09542
Length   16, n   16, alignment  3/ 3:	6.99998	2.67084	3.22862	5.48294	9.2366
Length   16, n   16, alignment  3/ 6:	7.14689	2.6615	3.21888	5.25964	9.1277
Length   16, n   16, alignment  6/ 3:	6.46654	2.65885	4.57873	5.25391 
7.75507
Length    8, n   16, alignment  5/ 6:	7.37286	2.33316	3.92971	4.50331 
10.1496
Length   16, n    8, alignment  6/ 5:	5.73663	1.87991	2.633	4.09291	5.91732
Length    8, n   16, alignment  5/ 6:	7.77512	2.33361	3.67636	4.50091	10.147
Length   16, n    8, alignment  6/ 5:	5.73662	1.88001	2.57119	4.10496 
6.15016
Length   16, n   16, alignment  4/ 4:	7.55115	2.65827	3.5838	5.25628	8.81586
Length   16, n   16, alignment  4/ 4:	7.61232	2.66851	3.62508	5.32044 
8.73914
Length   16, n   16, alignment  4/ 0:	7.54588	2.54345	3.48987	5.27812 
8.77989
Length   16, n   16, alignment  0/ 4:	6.82387	1.88425	2.41569	5.27746 
7.19847
Length   16, n   32, alignment  4/ 0:	10.1135	3.10868	6.01894	6.66693 
11.7681
Length   32, n   16, alignment  0/ 4:	6.93527	1.8793	2.4162	5.29155	6.50752
Length   16, n   32, alignment  4/ 0:	10.1565	3.16134	5.78062	6.81425 
11.2226
Length   32, n   16, alignment  0/ 4:	6.76758	1.87928	2.41649	5.30161 
7.22291
Length   16, n   16, alignment  5/ 5:	7.22753	2.56593	4.22659	5.30415 
9.86703
Length   16, n   16, alignment  5/ 5:	6.76256	2.54348	4.23108	5.43866 
9.53557
Length   16, n   16, alignment  5/ 2:	7.23702	2.52833	4.23011	5.26711 
9.52126
Length   16, n   16, alignment  2/ 5:	6.68084	2.66311	2.84314	5.2709	9.24495
Length   32, n   64, alignment  3/ 2:	12.4989	3.84198	6.40671	10.4545	14.317
Length   64, n   32, alignment  2/ 3:	10.1464	2.78457	3.17933	7.67569 
12.4356
Length   32, n   64, alignment  3/ 2:	12.4991	3.83968	7.08471	10.451	15.8984
Length   64, n   32, alignment  2/ 3:	9.61285	2.78401	3.18834	7.66606 
13.9602
Length   16, n   16, alignment  6/ 6:	7.24557	2.66839	4.55951	5.25563	7.7369
Length   16, n   16, alignment  6/ 6:	6.76327	2.65836	4.5127	5.63264	7.80333
Length   16, n   16, alignment  6/ 4:	7.15127	2.54397	4.57355	5.32957 
7.51005
Length   16, n   16, alignment  4/ 6:	7.51733	2.5615	3.67299	5.31244	8.73893
Length   64, n  128, alignment  2/ 4:	14.0745	4.98021	7.33878	11.4384 
17.1572
Length  128, n   64, alignment  4/ 2:	11.7179	3.59088	4.89414	10.2021 
11.6637
Length   64, n  128, alignment  2/ 4:	14.0841	4.99105	7.28507	11.4365 
21.7537
Length  128, n   64, alignment  4/ 2:	11.7142	3.59211	4.83864	9.87632 
19.4664
Length   16, n   16, alignment  7/ 7:	7.12738	2.53533	5.62213	5.30017 
7.90888
Length   16, n   16, alignment  7/ 7:	6.82635	2.53529	5.60694	5.27111 
8.88482
Length   16, n   16, alignment  7/ 6:	6.9193	2.54376	5.48117	5.24785	8.04263
Length   16, n   16, alignment  6/ 7:	6.89261	2.55078	4.51003	5.32471 
7.81768
Length  128, n  256, alignment  1/ 6:	16.2686	7.68983	9.35727	16.2843 
19.8458
Length  256, n  128, alignment  6/ 1:	13.4356	4.94899	7.94404	15.0122 
15.0231
Length  128, n  256, alignment  1/ 6:	16.2511	7.69025	9.35528	16.2859 
37.8453
Length  256, n  128, alignment  6/ 1:	13.4332	4.94446	8.02757	12.2879 
34.1949
Length    8, n   16, alignment  0/ 0:	7.26102	2.33285	3.75702	3.85762 
7.72869
Length   32, n   16, alignment  0/ 0:	7.049	1.88689	2.42187	2.41537	6.58192
Length    8, n   16, alignment  7/ 2:	8.09344	2.31269	3.67403	4.31612 
8.21018
Length   32, n   16, alignment  7/ 2:	6.822	2.45733	5.59593	5.33252	6.53496
Length   16, n   32, alignment  0/ 0:	9.99648	3.36432	4.70547	4.55746	10.148
Length   64, n   32, alignment  0/ 0:	7.89408	2.4309	2.58854	2.70519	8.89171
Length   16, n   32, alignment  6/ 4:	9.31969	3.15547	7.24937	9.47362 
10.0091
Length   64, n   32, alignment  6/ 4:	9.91687	2.78234	4.64259	7.00062 
10.5972
Length   32, n   64, alignment  0/ 0:	11.0651	3.81484	4.4379	4.91663	11.8363
Length  128, n   64, alignment  0/ 0:	9.25821	3.20129	3.55296	4.22664 
9.63556
Length   32, n   64, alignment  5/ 6:	12.5097	3.83422	7.29892	9.09849 
13.2517
Length  128, n   64, alignment  5/ 6:	11.6165	3.60246	5.35542	8.90704 
13.3207
Length   64, n  128, alignment  0/ 0:	12.372	4.91681	5.41951	6.91629	15.0813
Length  256, n  128, alignment  0/ 0:	7.93075	4.5247	6.29502	5.58357	12.5963
Length   64, n  128, alignment  4/ 0:	12.569	5.00092	7.25225	10.4764	15.9366
Length  256, n  128, alignment  4/ 0:	12.2963	4.90654	7.57109	12.0953 
16.7672
Length  128, n  256, alignment  0/ 0:	13.9015	7.34814	7.88738	9.15353 
19.4141
Length  512, n  256, alignment  0/ 0:	10.6865	6.52749	9.15011	9.71701 
20.9021
Length  128, n  256, alignment  3/ 2:	16.3681	7.53318	9.89911	18.5309 
20.8335
Length  512, n  256, alignment  3/ 2:	17.0249	7.10063	10.1568	22.6063 
25.1262
Length  256, n  512, alignment  0/ 0:	16.5169	12.3406	13.6056	14.5875 
29.2826
Length 1024, n  512, alignment  0/ 0:	16.3619	10.8422	16.7061	17.1025 
37.7908
Length  256, n  512, alignment  2/ 4:	21.162	12.9621	14.3306	26.0856	30.0397
Length 1024, n  512, alignment  2/ 4:	25.5543	11.9978	17.7424	42.4293 
47.7581
Length  512, n 1024, alignment  0/ 0:	20.5504	17.3132	19.5751	21.3633 
42.7215
Length 2048, n 1024, alignment  0/ 0:	28.5197	19.3708	37.1801	35.3122 
67.9792
Length  512, n 1024, alignment  1/ 6:	29.9875	17.7823	22.3228	47.3516 
51.3697
Length 2048, n 1024, alignment  1/ 6:	42.9443	21.6004	38.7767	78.1732 
83.9784
Adhemerval Zanella Sept. 30, 2020, 1:42 p.m. UTC | #2
On 29/09/2020 12:21, Raphael Moreira Zinsly via Libc-alpha wrote:
> Add stpncpy support into the POWER9 strncpy.

The benchmark numbers you provided [1] seem to show it is slightly worse than
the generic_strncpy which uses the same strategy as string/strncpy.c 
(which would use VSX instruction through memset/memcpy).  Did you compare this
optimization against an implementation that just call power8/9 memset/memcpy
instead? 

It should result in a smaller implementation which reduces i-cache size and
the code is much simpler and more maintainable.  The same applies for stpncpy.

I tried to dissuade Intel developers that such micro-optimizations are not
really a real gain and instead we should optimize only a handful of string
operations (memcpy/memset/etc.) and use composable implementation instead
(as generic strncpy).  It still resulted on 1a153e47fcc, but I think we 
might do better for powerpc.

[1] https://sourceware.org/pipermail/libc-alpha/2020-September/118049.html
Raphael M Zinsly Sept. 30, 2020, 2:21 p.m. UTC | #3
Hi Adhemerval,

On 30/09/2020 10:42, Adhemerval Zanella wrote:
> 
> 
> On 29/09/2020 12:21, Raphael Moreira Zinsly via Libc-alpha wrote:
>> Add stpncpy support into the POWER9 strncpy.
> 
> The benchmark numbers you provided [1] seems to show it is slight worse than
> the generic_strncpy which uses the same strategy as string/strncpy.c
> (which would use VSX instruction through memset/memcpy).

My implementation is always better than the generic_strncpy, almost 
three times better on average. And it calls memset as well.

Are you talking about __strncpy_ppc? For some reason it is using 
strnlen_ppc instead of the strnlen_power8, but I didn't touch it.

> Did you compare this
> optimization against an implementation that just call power8/9 memset/memcpy
> instead?
> 

Not sure if I understand, isn't that generic_strncpy and strncpy_ppc?


> It should resulting a smaller implementation which reduces i-cache size and
> the code is much more simpler and maintainable.  The same applies for stpncpy.
> 
> I tried to dissuade Intel developers that such micro-optimization are not
> really a real gain and instead we should optimize only a handful of string
> operations (memcpy/memset/etc.) and use composable implementation instead
> (as generic strncpy).  It still resulted on 1a153e47fcc, but I think we
> might do better for powerpc.
> 
> [1] https://sourceware.org/pipermail/libc-alpha/2020-September/118049.html
> 

Best Regards,
Adhemerval Zanella Sept. 30, 2020, 2:46 p.m. UTC | #4
On 30/09/2020 11:21, Raphael M Zinsly wrote:
> Hi Adhemerval,
> 
> On 30/09/2020 10:42, Adhemerval Zanella wrote:
>>
>>
>> On 29/09/2020 12:21, Raphael Moreira Zinsly via Libc-alpha wrote:
>>> Add stpncpy support into the POWER9 strncpy.
>>
>> The benchmark numbers you provided [1] seems to show it is slight worse than
>> the generic_strncpy which uses the same strategy as string/strncpy.c
>> (which would use VSX instruction through memset/memcpy).
> 
> My implementation is always better than the generic_strncpy, almost three times better in average. And it calls memset as well.
> 
> Are you talking about __strncpy_ppc? For some reason it is using strnlen_ppc instead of the strnlen_power8, but I didn't touch it.
> 
>> Did you compare this
>> optimization against an implementation that just call power8/9 memset/memcpy
>> instead?
>>
> 
> Not sure if I understand, isn't that generic_strncpy and strncpy_ppc?


Right, I misread the benchmark.  And I tested my own suggestion on the power9
from the gcc farm and it seems that although it is slightly faster than the power7
variant it does not really beat power8 (as expected since it calls strnlen and
then memcpy/memset and accesses the input twice).

I do not really oppose it and it is up to the arch maintainer, but I still think
these micro-optimizations tend to just add extra maintenance burden and icache
pressure that the microbenchmark does not really capture.

> 
> 
>> It should resulting a smaller implementation which reduces i-cache size and
>> the code is much more simpler and maintainable.  The same applies for stpncpy.
>>
>> I tried to dissuade Intel developers that such micro-optimization are not
>> really a real gain and instead we should optimize only a handful of string
>> operations (memcpy/memset/etc.) and use composable implementation instead
>> (as generic strncpy).  It still resulted on 1a153e47fcc, but I think we
>> might do better for powerpc.
>>
>> [1] https://sourceware.org/pipermail/libc-alpha/2020-September/118049.html
>>
> 
> Best Regards,
Tulio Magno Quites Machado Filho Nov. 12, 2020, 5:12 p.m. UTC | #5
Raphael Moreira Zinsly <rzinsly@linux.ibm.com> writes:

> Add stpncpy support into the POWER9 strncpy.

Same reminder for Reviewed-by.

> +#define MEMSET __memset_power8
> +#ifdef SHARED
> +#define MEMSET_is_local

Wrong indentation here.  Fixed.

Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>

Pushed as 7beee7b39ade.

Thanks!
diff mbox series

Patch

diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
new file mode 100644
index 0000000000..81d9673d8b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
@@ -0,0 +1,24 @@ 
+/* Optimized stpncpy implementation for POWER9 LE.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
index 67cb648c65..b7d308c984 100644
--- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
+++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
@@ -18,11 +18,19 @@ 
 
 #include <sysdep.h>
 
+#ifdef USE_AS_STPNCPY
+# ifndef STPNCPY
+#   define FUNC_NAME __stpncpy
+# else
+#   define FUNC_NAME STPNCPY
+# endif
+#else
 # ifndef STRNCPY
 #  define FUNC_NAME strncpy
 # else
 #  define FUNC_NAME STRNCPY
 # endif
+#endif  /* !USE_AS_STPNCPY  */
 
 #ifndef MEMSET
 /* For builds without IFUNC support, local calls should be made to internal
@@ -41,6 +49,12 @@ 
 
    char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
 
+   or
+
+   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   if USE_AS_STPNCPY is defined.
+
    The implementation can load bytes past a null terminator, but only
    up to the next 16-byte aligned address, so it never crosses a page.  */
 
@@ -66,7 +80,15 @@  ENTRY (FUNC_NAME, 4)
 
 	/* Empty/1-byte string optimization  */
 	cmpdi	r5,0
+#ifdef USE_AS_STPNCPY
+	bgt	L(cont)
+	/* Compute pointer to last byte copied into dest.  */
+	addi	r3,r3,1
+	blr
+L(cont):
+#else
 	beqlr
+#endif
 
 	addi	r4,r4,1
 	neg	r7,r4
@@ -96,12 +118,20 @@  ENTRY (FUNC_NAME, 4)
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
 
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(null):
 	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
 
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r7
+#endif
 	add	r11,r11,r8
 	sub	r5,r5,r8
 	b L(zero_padding)
@@ -185,6 +215,10 @@  L(n_tail4):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,48	/* Offset */
 	stxvl	32+v3,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_n_tail1):
@@ -196,6 +230,10 @@  L(prep_n_tail1):
 L(n_tail1):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_n_tail2):
@@ -209,6 +247,10 @@  L(n_tail2):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,16	/* offset */
 	stxvl	32+v1,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_n_tail3):
@@ -223,6 +265,10 @@  L(n_tail3):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,32	/* Offset */
 	stxvl	32+v2,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_tail1):
@@ -232,6 +278,10 @@  L(tail1):
 	addi	r9,r8,1		/* Add null terminator  */
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 	b L(zero_padding)
@@ -246,6 +296,10 @@  L(tail2):
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,16	/* offset */
 	stxvl	32+v1,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 	b L(zero_padding)
@@ -261,6 +315,10 @@  L(tail3):
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,32	/* offset */
 	stxvl	32+v2,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 	b L(zero_padding)
@@ -276,6 +334,10 @@  L(tail4):
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,48	/* offset */
 	stxvl	32+v3,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 
@@ -330,7 +392,8 @@  L(zero_padding_memset):
 	ld	r0,FRAMESIZE+16(r1)
 
 	mr	r3,r30       /* Restore the return value of strncpy, i.e.:
-				dest.  */
+				dest.  For stpncpy, the return value is the
+				same as return value of memset.  */
 	ld	r30,FRAMESIZE-8(r1) /* Restore r30.  */
 	/* Restore the stack frame.  */
 	addi	r1,r1,FRAMESIZE
@@ -341,3 +404,6 @@  L(zero_padding_memset):
 	blr
 
 END (FUNC_NAME)
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index cd2b47b403..f46bf50732 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@  sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 
 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
-		   rawmemchr-power9 strlen-power9 strncpy-power9
+		   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index fb55b07e53..d0f20cc97f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -318,6 +318,12 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c.  */
   IFUNC_IMPL (i, name, stpncpy,
+#ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, stpncpy,
+			      (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+			      && (hwcap & PPC_FEATURE_HAS_VSX),
+			      __stpncpy_power9)
+#endif
 	      IFUNC_IMPL_ADD (array, i, stpncpy,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __stpncpy_power8)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
new file mode 100644
index 0000000000..55daa3455f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
@@ -0,0 +1,29 @@ 
+/* Optimized stpncpy implementation for POWER9 LE.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define STPNCPY __stpncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#define MEMSET __memset_power8
+#ifdef SHARED
+#define MEMSET_is_local
+#endif
+
+#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
index 17df886431..3758f29ad1 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
@@ -26,10 +26,18 @@ 
 extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
 extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
 extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden;
+# endif
 # undef stpncpy
 # undef __stpncpy
 
 libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
+# ifdef __LITTLE_ENDIAN__
+		     (hwcap2 & PPC_FEATURE2_ARCH_3_00) &&
+		     (hwcap & PPC_FEATURE_HAS_VSX)
+		     ? __stpncpy_power9 :
+# endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __stpncpy_power8
 		       : (hwcap & PPC_FEATURE_HAS_VSX)