[v3,2/2] powerpc: Add optimized stpncpy for POWER9
Commit Message
Add stpncpy support into the POWER9 strncpy.
---
sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 +++++++
sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 68 ++++++++++++++++++-
sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
.../powerpc64/multiarch/ifunc-impl-list.c | 6 ++
.../powerpc64/multiarch/stpncpy-power9.S | 29 ++++++++
sysdeps/powerpc/powerpc64/multiarch/stpncpy.c | 8 +++
6 files changed, 135 insertions(+), 2 deletions(-)
create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
Comments
generic_stpncpy __stpncpy_power9
__stpncpy_power8 __stpncpy_power7 __stpncpy_ppc
Length 16, n 16, alignment 1/ 1: 7.04141 2.66905 2.71071 5.33257
9.45193
Length 16, n 16, alignment 1/ 1: 7.01728 2.54349 2.70763 5.35555
9.40601
Length 16, n 16, alignment 1/ 2: 6.76331 2.56894 2.70649 5.28715
9.19534
Length 16, n 16, alignment 2/ 1: 6.41285 2.52953 2.86392 5.25868
9.24343
Length 2, n 4, alignment 7/ 2: 7.76627 2.36037 4.34749 4.05757
8.45648
Length 4, n 2, alignment 2/ 7: 6.15257 1.734 2.66932 2.81884 6.61486
Length 2, n 4, alignment 7/ 2: 7.69004 2.34779 3.90224 4.08693
8.51617
Length 4, n 2, alignment 2/ 7: 6.14888 1.73738 2.66929 2.81777
6.39066
Length 16, n 16, alignment 2/ 2: 7.25765 2.5434 2.8759 4.7084 9.43171
Length 16, n 16, alignment 2/ 2: 6.41274 2.52681 2.87939 5.2894 9.2505
Length 16, n 16, alignment 2/ 4: 6.74797 2.6683 2.82869 5.27608 9.43391
Length 16, n 16, alignment 4/ 2: 7.6281 2.54368 3.52982 5.26862 8.7369
Length 4, n 8, alignment 6/ 4: 7.79233 2.33099 5.64785 4.21131 9.03
Length 8, n 4, alignment 4/ 6: 6.01824 1.73782 2.81779 2.81777
7.90004
Length 4, n 8, alignment 6/ 4: 7.94851 2.33098 4.90456 3.75698
8.89379
Length 8, n 4, alignment 4/ 6: 6.0183 1.73715 2.81777 2.41521 7.83867
Length 16, n 16, alignment 3/ 3: 6.93178 2.66854 3.22004 5.31673
9.09542
Length 16, n 16, alignment 3/ 3: 6.99998 2.67084 3.22862 5.48294 9.2366
Length 16, n 16, alignment 3/ 6: 7.14689 2.6615 3.21888 5.25964 9.1277
Length 16, n 16, alignment 6/ 3: 6.46654 2.65885 4.57873 5.25391
7.75507
Length 8, n 16, alignment 5/ 6: 7.37286 2.33316 3.92971 4.50331
10.1496
Length 16, n 8, alignment 6/ 5: 5.73663 1.87991 2.633 4.09291 5.91732
Length 8, n 16, alignment 5/ 6: 7.77512 2.33361 3.67636 4.50091 10.147
Length 16, n 8, alignment 6/ 5: 5.73662 1.88001 2.57119 4.10496
6.15016
Length 16, n 16, alignment 4/ 4: 7.55115 2.65827 3.5838 5.25628 8.81586
Length 16, n 16, alignment 4/ 4: 7.61232 2.66851 3.62508 5.32044
8.73914
Length 16, n 16, alignment 4/ 0: 7.54588 2.54345 3.48987 5.27812
8.77989
Length 16, n 16, alignment 0/ 4: 6.82387 1.88425 2.41569 5.27746
7.19847
Length 16, n 32, alignment 4/ 0: 10.1135 3.10868 6.01894 6.66693
11.7681
Length 32, n 16, alignment 0/ 4: 6.93527 1.8793 2.4162 5.29155 6.50752
Length 16, n 32, alignment 4/ 0: 10.1565 3.16134 5.78062 6.81425
11.2226
Length 32, n 16, alignment 0/ 4: 6.76758 1.87928 2.41649 5.30161
7.22291
Length 16, n 16, alignment 5/ 5: 7.22753 2.56593 4.22659 5.30415
9.86703
Length 16, n 16, alignment 5/ 5: 6.76256 2.54348 4.23108 5.43866
9.53557
Length 16, n 16, alignment 5/ 2: 7.23702 2.52833 4.23011 5.26711
9.52126
Length 16, n 16, alignment 2/ 5: 6.68084 2.66311 2.84314 5.2709 9.24495
Length 32, n 64, alignment 3/ 2: 12.4989 3.84198 6.40671 10.4545 14.317
Length 64, n 32, alignment 2/ 3: 10.1464 2.78457 3.17933 7.67569
12.4356
Length 32, n 64, alignment 3/ 2: 12.4991 3.83968 7.08471 10.451 15.8984
Length 64, n 32, alignment 2/ 3: 9.61285 2.78401 3.18834 7.66606
13.9602
Length 16, n 16, alignment 6/ 6: 7.24557 2.66839 4.55951 5.25563 7.7369
Length 16, n 16, alignment 6/ 6: 6.76327 2.65836 4.5127 5.63264 7.80333
Length 16, n 16, alignment 6/ 4: 7.15127 2.54397 4.57355 5.32957
7.51005
Length 16, n 16, alignment 4/ 6: 7.51733 2.5615 3.67299 5.31244 8.73893
Length 64, n 128, alignment 2/ 4: 14.0745 4.98021 7.33878 11.4384
17.1572
Length 128, n 64, alignment 4/ 2: 11.7179 3.59088 4.89414 10.2021
11.6637
Length 64, n 128, alignment 2/ 4: 14.0841 4.99105 7.28507 11.4365
21.7537
Length 128, n 64, alignment 4/ 2: 11.7142 3.59211 4.83864 9.87632
19.4664
Length 16, n 16, alignment 7/ 7: 7.12738 2.53533 5.62213 5.30017
7.90888
Length 16, n 16, alignment 7/ 7: 6.82635 2.53529 5.60694 5.27111
8.88482
Length 16, n 16, alignment 7/ 6: 6.9193 2.54376 5.48117 5.24785 8.04263
Length 16, n 16, alignment 6/ 7: 6.89261 2.55078 4.51003 5.32471
7.81768
Length 128, n 256, alignment 1/ 6: 16.2686 7.68983 9.35727 16.2843
19.8458
Length 256, n 128, alignment 6/ 1: 13.4356 4.94899 7.94404 15.0122
15.0231
Length 128, n 256, alignment 1/ 6: 16.2511 7.69025 9.35528 16.2859
37.8453
Length 256, n 128, alignment 6/ 1: 13.4332 4.94446 8.02757 12.2879
34.1949
Length 8, n 16, alignment 0/ 0: 7.26102 2.33285 3.75702 3.85762
7.72869
Length 32, n 16, alignment 0/ 0: 7.049 1.88689 2.42187 2.41537 6.58192
Length 8, n 16, alignment 7/ 2: 8.09344 2.31269 3.67403 4.31612
8.21018
Length 32, n 16, alignment 7/ 2: 6.822 2.45733 5.59593 5.33252 6.53496
Length 16, n 32, alignment 0/ 0: 9.99648 3.36432 4.70547 4.55746 10.148
Length 64, n 32, alignment 0/ 0: 7.89408 2.4309 2.58854 2.70519 8.89171
Length 16, n 32, alignment 6/ 4: 9.31969 3.15547 7.24937 9.47362
10.0091
Length 64, n 32, alignment 6/ 4: 9.91687 2.78234 4.64259 7.00062
10.5972
Length 32, n 64, alignment 0/ 0: 11.0651 3.81484 4.4379 4.91663 11.8363
Length 128, n 64, alignment 0/ 0: 9.25821 3.20129 3.55296 4.22664
9.63556
Length 32, n 64, alignment 5/ 6: 12.5097 3.83422 7.29892 9.09849
13.2517
Length 128, n 64, alignment 5/ 6: 11.6165 3.60246 5.35542 8.90704
13.3207
Length 64, n 128, alignment 0/ 0: 12.372 4.91681 5.41951 6.91629 15.0813
Length 256, n 128, alignment 0/ 0: 7.93075 4.5247 6.29502 5.58357 12.5963
Length 64, n 128, alignment 4/ 0: 12.569 5.00092 7.25225 10.4764 15.9366
Length 256, n 128, alignment 4/ 0: 12.2963 4.90654 7.57109 12.0953
16.7672
Length 128, n 256, alignment 0/ 0: 13.9015 7.34814 7.88738 9.15353
19.4141
Length 512, n 256, alignment 0/ 0: 10.6865 6.52749 9.15011 9.71701
20.9021
Length 128, n 256, alignment 3/ 2: 16.3681 7.53318 9.89911 18.5309
20.8335
Length 512, n 256, alignment 3/ 2: 17.0249 7.10063 10.1568 22.6063
25.1262
Length 256, n 512, alignment 0/ 0: 16.5169 12.3406 13.6056 14.5875
29.2826
Length 1024, n 512, alignment 0/ 0: 16.3619 10.8422 16.7061 17.1025
37.7908
Length 256, n 512, alignment 2/ 4: 21.162 12.9621 14.3306 26.0856 30.0397
Length 1024, n 512, alignment 2/ 4: 25.5543 11.9978 17.7424 42.4293
47.7581
Length 512, n 1024, alignment 0/ 0: 20.5504 17.3132 19.5751 21.3633
42.7215
Length 2048, n 1024, alignment 0/ 0: 28.5197 19.3708 37.1801 35.3122
67.9792
Length 512, n 1024, alignment 1/ 6: 29.9875 17.7823 22.3228 47.3516
51.3697
Length 2048, n 1024, alignment 1/ 6: 42.9443 21.6004 38.7767 78.1732
83.9784
On 29/09/2020 12:21, Raphael Moreira Zinsly via Libc-alpha wrote:
> Add stpncpy support into the POWER9 strncpy.
The benchmark numbers you provided [1] seem to show it is slightly worse than
the generic_strncpy, which uses the same strategy as string/strncpy.c
(which would use VSX instruction through memset/memcpy). Did you compare this
optimization against an implementation that just call power8/9 memset/memcpy
instead?
It should result in a smaller implementation, which reduces i-cache size, and
the code is much simpler and more maintainable. The same applies for stpncpy.
I tried to dissuade Intel developers that such micro-optimizations are not
really a real gain and instead we should optimize only a handful of string
operations (memcpy/memset/etc.) and use composable implementations instead
(as generic strncpy). It still resulted in 1a153e47fcc, but I think we
might do better for powerpc.
[1] https://sourceware.org/pipermail/libc-alpha/2020-September/118049.html
Hi Adhemerval,
On 30/09/2020 10:42, Adhemerval Zanella wrote:
>
>
> On 29/09/2020 12:21, Raphael Moreira Zinsly via Libc-alpha wrote:
>> Add stpncpy support into the POWER9 strncpy.
>
> The benchmark numbers you provided [1] seems to show it is slight worse than
> the generic_strncpy which uses the same strategy as string/strncpy.c
> (which would use VSX instruction through memset/memcpy).
My implementation is always better than the generic_strncpy, almost
three times better on average. And it calls memset as well.
Are you talking about __strncpy_ppc? For some reason it is using
strnlen_ppc instead of the strnlen_power8, but I didn't touch it.
> Did you compare this
> optimization against an implementation that just call power8/9 memset/memcpy
> instead?
>
Not sure if I understand, isn't that generic_strncpy and strncpy_ppc?
> It should resulting a smaller implementation which reduces i-cache size and
> the code is much more simpler and maintainable. The same applies for stpncpy.
>
> I tried to dissuade Intel developers that such micro-optimization are not
> really a real gain and instead we should optimize only a handful of string
> operations (memcpy/memset/etc.) and use composable implementation instead
> (as generic strncpy). It still resulted on 1a153e47fcc, but I think we
> might do better for powerpc.
>
> [1] https://sourceware.org/pipermail/libc-alpha/2020-September/118049.html
>
Best Regards,
On 30/09/2020 11:21, Raphael M Zinsly wrote:
> Hi Adhemerval,
>
> On 30/09/2020 10:42, Adhemerval Zanella wrote:
>>
>>
>> On 29/09/2020 12:21, Raphael Moreira Zinsly via Libc-alpha wrote:
>>> Add stpncpy support into the POWER9 strncpy.
>>
>> The benchmark numbers you provided [1] seems to show it is slight worse than
>> the generic_strncpy which uses the same strategy as string/strncpy.c
>> (which would use VSX instruction through memset/memcpy).
>
> My implementation is always better than the generic_strncpy, almost three times better in average. And it calls memset as well.
>
> Are you talking about __strncpy_ppc? For some reason it is using strnlen_ppc instead of the strnlen_power8, but I didn't touch it.
>
>> Did you compare this
>> optimization against an implementation that just call power8/9 memset/memcpy
>> instead?
>>
>
> Not sure if I understand, isn't that generic_strncpy and strncpy_ppc?
Right, I misread the benchmark. And I tested my own suggestion on the power9
from the gcc farm and it seems that although it is slightly faster than the
power7 variant, it does not really beat power8 (as expected, since it calls
strnlen and then memcpy/memset and accesses the input twice).
I do not really oppose it and it is up to the arch maintainer, but I still think
these micro-optimizations tend to just add extra maintenance burden and icache
pressure that the microbenchmark does not really capture.
>
>
>> It should resulting a smaller implementation which reduces i-cache size and
>> the code is much more simpler and maintainable. The same applies for stpncpy.
>>
>> I tried to dissuade Intel developers that such micro-optimization are not
>> really a real gain and instead we should optimize only a handful of string
>> operations (memcpy/memset/etc.) and use composable implementation instead
>> (as generic strncpy). It still resulted on 1a153e47fcc, but I think we
>> might do better for powerpc.
>>
>> [1] https://sourceware.org/pipermail/libc-alpha/2020-September/118049.html
>>
>
> Best Regards,
Raphael Moreira Zinsly <rzinsly@linux.ibm.com> writes:
> Add stpncpy support into the POWER9 strncpy.
Same reminder for Reviewed-by.
> +#define MEMSET __memset_power8
> +#ifdef SHARED
> +#define MEMSET_is_local
Wrong indentation here. Fixed.
Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
Pushed as 7beee7b39ade.
Thanks!
new file mode 100644
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for POWER9 LE.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
@@ -18,11 +18,19 @@
#include <sysdep.h>
+#ifdef USE_AS_STPNCPY
+# ifndef STPNCPY
+# define FUNC_NAME __stpncpy
+# else
+# define FUNC_NAME STPNCPY
+# endif
+#else
# ifndef STRNCPY
# define FUNC_NAME strncpy
# else
# define FUNC_NAME STRNCPY
# endif
+#endif /* !USE_AS_STPNCPY */
#ifndef MEMSET
/* For builds without IFUNC support, local calls should be made to internal
@@ -41,6 +49,12 @@
char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+ or
+
+ char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+ if USE_AS_STPNCPY is defined.
+
The implementation can load bytes past a null terminator, but only
up to the next 16-byte aligned address, so it never crosses a page. */
@@ -66,7 +80,15 @@ ENTRY (FUNC_NAME, 4)
/* Empty/1-byte string optimization */
cmpdi r5,0
+#ifdef USE_AS_STPNCPY
+ bgt L(cont)
+ /* Compute pointer to last byte copied into dest. */
+ addi r3,r3,1
+ blr
+L(cont):
+#else
beqlr
+#endif
addi r4,r4,1
neg r7,r4
@@ -96,12 +118,20 @@ ENTRY (FUNC_NAME, 4)
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r5
+#endif
blr
L(null):
sldi r10,r8,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r7
+#endif
add r11,r11,r8
sub r5,r5,r8
b L(zero_padding)
@@ -185,6 +215,10 @@ L(n_tail4):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
addi r11,r11,48 /* Offset */
stxvl 32+v3,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r5
+#endif
blr
L(prep_n_tail1):
@@ -196,6 +230,10 @@ L(prep_n_tail1):
L(n_tail1):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r5
+#endif
blr
L(prep_n_tail2):
@@ -209,6 +247,10 @@ L(n_tail2):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
addi r11,r11,16 /* offset */
stxvl 32+v1,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r5
+#endif
blr
L(prep_n_tail3):
@@ -223,6 +265,10 @@ L(n_tail3):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
addi r11,r11,32 /* Offset */
stxvl 32+v2,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r5
+#endif
blr
L(prep_tail1):
@@ -232,6 +278,10 @@ L(tail1):
addi r9,r8,1 /* Add null terminator */
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
b L(zero_padding)
@@ -246,6 +296,10 @@ L(tail2):
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,16 /* offset */
stxvl 32+v1,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
b L(zero_padding)
@@ -261,6 +315,10 @@ L(tail3):
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,32 /* offset */
stxvl 32+v2,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
b L(zero_padding)
@@ -276,6 +334,10 @@ L(tail4):
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,48 /* offset */
stxvl 32+v3,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
@@ -330,7 +392,8 @@ L(zero_padding_memset):
ld r0,FRAMESIZE+16(r1)
mr r3,r30 /* Restore the return value of strncpy, i.e.:
- dest. */
+ dest. For stpncpy, the return value is the
+ same as return value of memset. */
ld r30,FRAMESIZE-8(r1) /* Restore r30. */
/* Restore the stack frame. */
addi r1,r1,FRAMESIZE
@@ -341,3 +404,6 @@ L(zero_padding_memset):
blr
END (FUNC_NAME)
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#endif
@@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
ifneq (,$(filter %le,$(config-machine)))
sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
- rawmemchr-power9 strlen-power9 strncpy-power9
+ rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
@@ -318,6 +318,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c. */
IFUNC_IMPL (i, name, stpncpy,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, stpncpy,
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ && (hwcap & PPC_FEATURE_HAS_VSX),
+ __stpncpy_power9)
+#endif
IFUNC_IMPL_ADD (array, i, stpncpy,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__stpncpy_power8)
new file mode 100644
@@ -0,0 +1,29 @@
+/* Optimized stpncpy implementation for POWER9 LE.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define STPNCPY __stpncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#define MEMSET __memset_power8
+#ifdef SHARED
+#define MEMSET_is_local
+#endif
+
+#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S>
@@ -26,10 +26,18 @@
extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden;
+# endif
# undef stpncpy
# undef __stpncpy
libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
+# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00) &&
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __stpncpy_power9 :
+# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __stpncpy_power8
: (hwcap & PPC_FEATURE_HAS_VSX)