[2/2] powerpc: Optimzed stpncpy for POWER9
Commit Message
Adds stpncpy support into the POWER9 strncpy.
---
sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 ++++++
sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 74 +++++++++++++++++++
sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
.../powerpc64/multiarch/ifunc-impl-list.c | 5 ++
.../powerpc64/multiarch/stpncpy-power9.S | 24 ++++++
sysdeps/powerpc/powerpc64/multiarch/stpncpy.c | 7 ++
6 files changed, 135 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
Comments
Here is the make bench output:
generic_stpncpy __stpncpy_power9 __stpncpy_power8 __stpncpy_power7 __stpncpy_ppc
Length 16, n 16, alignment 1/ 1: 7.31792 2.79249 2.98207 6.20964 11.2262
Length 16, n 16, alignment 1/ 1: 7.26441 2.79883 2.97986 6.09795 11.1118
Length 16, n 16, alignment 1/ 2: 7.22475 2.82518 2.98169 6.18967 10.9933
Length 16, n 16, alignment 2/ 1: 7.28211 2.78851 3.1079 6.06067 10.4232
Length 2, n 4, alignment 7/ 2: 9.30193 2.4733 4.30086 4.74387 9.25328
Length 4, n 2, alignment 2/ 7: 6.7756 1.91031 2.93946 3.24475 7.76389
Length 2, n 4, alignment 7/ 2: 8.81319 2.4726 4.57341 4.74421 9.44667
Length 4, n 2, alignment 2/ 7: 6.77806 1.9118 2.93637 3.1857 7.00171
Length 16, n 16, alignment 2/ 2: 7.35335 2.80104 3.10653 5.85492 10.5689
Length 16, n 16, alignment 2/ 2: 7.14308 2.78571 3.10889 6.10044 10.4816
Length 16, n 16, alignment 2/ 4: 7.21628 2.81563 3.10724 6.14674 10.6005
Length 16, n 16, alignment 4/ 2: 7.47713 2.80531 3.80081 5.86977 9.43599
Length 4, n 8, alignment 6/ 4: 8.63537 2.4676 5.53825 4.1877 9.88309
Length 8, n 4, alignment 4/ 6: 6.63429 1.91051 3.10751 2.76472 8.4156
Length 4, n 8, alignment 6/ 4: 8.59304 2.43152 5.30288 4.16475 9.77498
Length 8, n 4, alignment 4/ 6: 6.63843 1.91047 3.19713 2.69566 8.67023
Length 16, n 16, alignment 3/ 3: 7.45277 2.80045 3.42433 6.06204 9.92282
Length 16, n 16, alignment 3/ 3: 8.04191 2.78645 3.43317 5.99773 10.0662
Length 16, n 16, alignment 3/ 6: 7.5816 2.81606 3.44168 6.0801 9.94673
Length 16, n 16, alignment 6/ 3: 7.10582 2.80176 5.03947 6.06942 8.40249
Length 8, n 16, alignment 5/ 6: 8.19747 2.42028 4.30043 5.0752 11.3093
Length 16, n 8, alignment 6/ 5: 6.37287 2.07239 2.56322 4.36972 6.52164
Length 8, n 16, alignment 5/ 6: 8.25022 2.45124 4.05051 5.02258 10.8683
Length 16, n 8, alignment 6/ 5: 6.31868 2.07215 2.83061 4.44584 7.14464
Length 16, n 16, alignment 4/ 4: 7.54408 2.80105 3.82846 5.71392 9.91359
Length 16, n 16, alignment 4/ 4: 7.66265 2.79063 3.86233 6.06489 9.31705
Length 16, n 16, alignment 4/ 0: 7.84286 2.79896 3.83148 6.08954 9.55253
Length 16, n 16, alignment 0/ 4: 7.36697 2.07019 2.66533 6.13894 7.75685
Length 16, n 32, alignment 4/ 0: 10.3819 3.33088 6.32994 7.24949 12.3827
Length 32, n 16, alignment 0/ 4: 7.15586 2.07172 2.66097 6.11743 7.56448
Length 16, n 32, alignment 4/ 0: 10.3262 3.35225 6.34556 7.3211 12.2527
Length 32, n 16, alignment 0/ 4: 7.13287 2.07265 2.6613 6.17878 7.61901
Length 16, n 16, alignment 5/ 5: 7.22471 2.80128 4.65776 6.15455 9.93333
Length 16, n 16, alignment 5/ 5: 7.22458 2.78586 4.65874 6.06763 9.87968
Length 16, n 16, alignment 5/ 2: 7.22718 2.79127 4.65999 6.025 10.3775
Length 16, n 16, alignment 2/ 5: 7.73485 2.8025 3.10754 6.08303 10.3871
Length 32, n 64, alignment 3/ 2: 13.7685 4.1256 7.04965 11.5105 15.3903
Length 64, n 32, alignment 2/ 3: 10.526 3.05149 3.59497 8.45078 13.7462
Length 32, n 64, alignment 3/ 2: 13.7681 4.11611 7.08236 11.5129 16.6004
Length 64, n 32, alignment 2/ 3: 10.962 3.05712 3.60447 8.43981 15.4906
Length 16, n 16, alignment 6/ 6: 7.30916 2.80056 5.03985 6.16331 8.43692
Length 16, n 16, alignment 6/ 6: 7.31688 2.7914 5.02931 6.12345 8.42848
Length 16, n 16, alignment 6/ 4: 7.7402 2.7993 5.04435 6.02685 8.28199
Length 16, n 16, alignment 4/ 6: 7.79103 2.82496 3.82464 6.0778 9.31532
Length 64, n 128, alignment 2/ 4: 15.4969 5.3714 8.09812 12.6067 18.7831
Length 128, n 64, alignment 4/ 2: 12.9023 3.93138 5.46487 10.7071 13.3253
Length 64, n 128, alignment 2/ 4: 15.4998 5.42611 7.88843 12.6007 24.0491
Length 128, n 64, alignment 4/ 2: 12.8971 3.94646 5.49689 11.1747 21.5779
Length 16, n 16, alignment 7/ 7: 7.68992 2.78151 6.14775 6.19397 8.38412
Length 16, n 16, alignment 7/ 7: 7.90811 2.7803 6.11502 6.17383 8.78371
Length 16, n 16, alignment 7/ 6: 7.45456 2.80173 5.93657 6.15191 8.38489
Length 16, n 16, alignment 6/ 7: 7.44846 2.80238 5.03654 6.1154 8.41589
Length 128, n 256, alignment 1/ 6: 17.9114 8.39532 10.3246 17.9457 21.9452
Length 256, n 128, alignment 6/ 1: 14.8346 5.41104 8.89047 13.5379 17.1437
Length 128, n 256, alignment 1/ 6: 17.9118 8.39985 10.3271 17.9503 42.0831
Length 256, n 128, alignment 6/ 1: 14.8306 5.40714 9.04492 13.5227 37.819
Length 8, n 16, alignment 0/ 0: 8.19945 2.46752 4.04264 4.62897 8.22975
Length 32, n 16, alignment 0/ 0: 7.23617 2.07229 2.66504 2.66683 7.93411
Length 8, n 16, alignment 7/ 2: 8.26373 2.41779 4.18003 5.31418 9.0473
Length 32, n 16, alignment 7/ 2: 7.46119 2.63992 6.16424 6.14534 7.28237
Length 16, n 32, alignment 0/ 0: 10.1282 3.42401 5.00287 5.02318 11.4985
Length 64, n 32, alignment 0/ 0: 9.29452 2.57779 2.79807 3.1362 10.9532
Length 16, n 32, alignment 6/ 4: 10.2194 3.30297 7.48371 10.4067 11.2264
Length 64, n 32, alignment 6/ 4: 10.6887 3.04976 5.13062 8.10511 11.1225
Length 32, n 64, alignment 0/ 0: 12.1806 4.09924 5.12341 6.14159 14.0965
Length 128, n 64, alignment 0/ 0: 10.1569 3.52625 3.88528 4.65782 11.3018
Length 32, n 64, alignment 5/ 6: 13.7795 4.13456 8.53476 10.2846 15.1556
Length 128, n 64, alignment 5/ 6: 12.8171 3.92765 5.82505 10.3559 15.0831
Length 64, n 128, alignment 0/ 0: 13.6328 5.33523 6.43324 7.92213 16.4658
Length 256, n 128, alignment 0/ 0: 8.92495 4.97169 7.13044 6.30158 12.9039
Length 64, n 128, alignment 4/ 0: 13.8393 5.36588 7.52682 11.5294 17.5523
Length 256, n 128, alignment 4/ 0: 13.5309 5.36019 7.56527 13.3503 17.8202
Length 128, n 256, alignment 0/ 0: 15.2956 8.14449 8.79678 9.69352 21.2463
Length 512, n 256, alignment 0/ 0: 11.5667 7.22974 10.1355 10.2592 21.5805
Length 128, n 256, alignment 3/ 2: 18.0152 8.21506 10.9175 20.4131 22.3927
Length 512, n 256, alignment 3/ 2: 18.7328 7.81909 11.251 25.0633 29.2378
Length 256, n 512, alignment 0/ 0: 17.5135 13.9768 15.6849 16.1219 30.9344
Length 1024, n 512, alignment 0/ 0: 17.988 11.8498 18.4388 18.7385 41.5762
Length 256, n 512, alignment 2/ 4: 23.3724 14.8026 15.9182 28.6762 33.9031
Length 1024, n 512, alignment 2/ 4: 27.9562 13.2785 19.5893 46.9671 52.4943
Length 512, n 1024, alignment 0/ 0: 23.3637 25.283 21.2536 23.4228 55.6501
Length 2048, n 1024, alignment 0/ 0: 31.303 21.2731 40.7001 38.8365 75.1105
Length 512, n 1024, alignment 1/ 6: 33.0535 26.873 24.8167 51.5917 56.236
Length 2048, n 1024, alignment 1/ 6: 47.5444 24.0206 42.5163 86.0245 92.5819
Thank you for your contributions, I have a few minor
comments/suggestions below.
On 8/20/20 1:29 PM, Raphael Moreira Zinsly via Libc-alpha wrote:
> Adds stpncpy support into the POWER9 strncpy.
s/Adds/Add/ s/into the/to/.
Likewise, s/Optimzed/Add optimized/ in the title.
> ---
> sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 ++++++
> sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 74 +++++++++++++++++++
> sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
> .../powerpc64/multiarch/ifunc-impl-list.c | 5 ++
> .../powerpc64/multiarch/stpncpy-power9.S | 24 ++++++
> sysdeps/powerpc/powerpc64/multiarch/stpncpy.c | 7 ++
> 6 files changed, 135 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
> create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
>
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
> new file mode 100644
> index 0000000000..a96840bb6f
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
> @@ -0,0 +1,24 @@
> +/* Optimized stpncpy implementation for PowerPC64/POWER9.
> + Copyright (C) 2015-2020 Free Software Foundation, Inc.
Should this date be exclusively 2020?
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define USE_AS_STPNCPY
> +#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
> +
> +weak_alias (__stpncpy, stpncpy)
> +libc_hidden_def (__stpncpy)
> +libc_hidden_builtin_def (stpncpy)
OK.
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> index cde68384d4..64b06a9040 100644
> --- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -18,16 +18,30 @@
>
> #include <sysdep.h>
>
> +#ifdef USE_AS_STPNCPY
> +# ifndef STPNCPY
> +# define FUNC_NAME __stpncpy
> +# else
> +# define FUNC_NAME STPNCPY
> +# endif
> +#else
> # ifndef STRNCPY
> # define FUNC_NAME strncpy
> # else
> # define FUNC_NAME STRNCPY
> # endif
> +#endif /* !USE_AS_STPNCPY */
>
> /* Implements the function
>
> char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
>
> + or
> +
> + char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> + if USE_AS_STPNCPY is defined.
> +
> The implementation can load bytes past a null terminator, but only
> up to the next 16B boundary, so it never crosses a page. */
>
> @@ -47,6 +61,13 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
> beq L(zero_padding_loop)
>
> cmpwi r5,0
> +#ifdef USE_AS_STPNCPY
> + bgt L(cont)
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
"Compute pointer to last byte copied into dest." Likewise for the other
copied instances.
> + addi r3,r3,1
> + blr
> +#endif
OK.
> beqlr
This beqlr is unreachable in the stpncpy build. Can it be conditionally
included only in the !stpncpy configuration?
>
> L(cont):
> @@ -77,12 +98,22 @@ L(cont):
> sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> stxvl 32+v0,r11,r10 /* Partial store */
>
> +#ifdef USE_AS_STPNCPY
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
> + add r3,r11,r5
> +#endif
> blr
>
> L(null):
> sldi r10,r8,56 /* stxvl wants size in top 8 bits */
> stxvl 32+v0,r11,r10 /* Partial store */
>
> +#ifdef USE_AS_STPNCPY
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
> + add r3,r11,r7
> +#endif
> add r11,r11,r8
> sub r5,r5,r8
> b L(zero_padding_loop)
> @@ -164,6 +195,11 @@ L(n_tail4):
> sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,48 /* Offset */
> stxvl 32+v3,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
> + add r3,r11,r5
> +#endif
> blr
>
> L(prep_n_tail1):
> @@ -174,6 +210,11 @@ L(prep_n_tail1):
> L(n_tail1):
> sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> stxvl 32+v0,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
> + add r3,r11,r5
> +#endif
> blr
>
> L(prep_n_tail2):
> @@ -186,6 +227,11 @@ L(n_tail2):
> sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,16 /* offset */
> stxvl 32+v1,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
> + add r3,r11,r5
> +#endif
> blr
>
> L(prep_n_tail3):
> @@ -199,6 +245,11 @@ L(n_tail3):
> sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,32 /* Offset */
> stxvl 32+v2,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
> + add r3,r11,r5
> +#endif
> blr
>
> L(prep_tail1):
> @@ -208,6 +259,11 @@ L(tail1):
> addi r9,r8,1 /* Add null terminator */
> sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> stxvl 32+v0,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
> + add r3,r11,r8
> +#endif
> add r11,r11,r9
> sub r5,r5,r9
> b L(zero_padding_loop)
> @@ -222,6 +278,11 @@ L(tail2):
> sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,16 /* offset */
> stxvl 32+v1,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
> + add r3,r11,r8
> +#endif
> add r11,r11,r9
> sub r5,r5,r9
> b L(zero_padding_loop)
> @@ -237,6 +298,11 @@ L(tail3):
> sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,32 /* offset */
> stxvl 32+v2,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
> + add r3,r11,r8
> +#endif
> add r11,r11,r9
> sub r5,r5,r9
> b L(zero_padding_loop)
> @@ -252,6 +318,11 @@ L(tail4):
> sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,48 /* offset */
> stxvl 32+v3,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
> + add r3,r11,r8
> +#endif
> add r11,r11,r9
> sub r5,r5,r9
>
> @@ -274,3 +345,6 @@ L(zero_padding_end):
> L(n_tail):
>
> END (FUNC_NAME)
> +#ifndef USE_AS_STPNCPY
> +libc_hidden_builtin_def (strncpy)
> +#endif
OK.
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index cd2b47b403..f46bf50732 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
OK.
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index aa63e1c23f..56790bcfe3 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
OK.
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
> new file mode 100644
> index 0000000000..ecbbb5c8e9
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
> @@ -0,0 +1,24 @@
> +/* Optimized stpncpy implementation for POWER9/PPC64.
> + Copyright (C) 2015-2020 Free Software Foundation, Inc.
Minor nit, I suspect that date should only include 2020.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define STPNCPY __stpncpy_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
OK.
> +#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> index 17df886431..21702716a3 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> @@ -26,10 +26,17 @@
> extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
> extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
> extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden;
> +# endif
> # undef stpncpy
> # undef __stpncpy
>
> libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
> +# ifdef __LITTLE_ENDIAN__
> + (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> + ? __stpncpy_power9 :
> +# endif
> (hwcap2 & PPC_FEATURE2_ARCH_2_07)
> ? __stpncpy_power8
> : (hwcap & PPC_FEATURE_HAS_VSX)
>
I think the spacing is off by two here.
new file mode 100644
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for PowerPC64/POWER9.
+ Copyright (C) 2015-2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
@@ -18,16 +18,30 @@
#include <sysdep.h>
+#ifdef USE_AS_STPNCPY
+# ifndef STPNCPY
+# define FUNC_NAME __stpncpy
+# else
+# define FUNC_NAME STPNCPY
+# endif
+#else
# ifndef STRNCPY
# define FUNC_NAME strncpy
# else
# define FUNC_NAME STRNCPY
# endif
+#endif /* !USE_AS_STPNCPY */
/* Implements the function
char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+ or
+
+ char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+ if USE_AS_STPNCPY is defined.
+
The implementation can load bytes past a null terminator, but only
up to the next 16B boundary, so it never crosses a page. */
@@ -47,6 +61,13 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
beq L(zero_padding_loop)
cmpwi r5,0
+#ifdef USE_AS_STPNCPY
+ bgt L(cont)
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ addi r3,r3,1
+ blr
+#endif
beqlr
L(cont):
@@ -77,12 +98,22 @@ L(cont):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r5
+#endif
blr
L(null):
sldi r10,r8,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r7
+#endif
add r11,r11,r8
sub r5,r5,r8
b L(zero_padding_loop)
@@ -164,6 +195,11 @@ L(n_tail4):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
addi r11,r11,48 /* Offset */
stxvl 32+v3,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r5
+#endif
blr
L(prep_n_tail1):
@@ -174,6 +210,11 @@ L(prep_n_tail1):
L(n_tail1):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r5
+#endif
blr
L(prep_n_tail2):
@@ -186,6 +227,11 @@ L(n_tail2):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
addi r11,r11,16 /* offset */
stxvl 32+v1,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r5
+#endif
blr
L(prep_n_tail3):
@@ -199,6 +245,11 @@ L(n_tail3):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
addi r11,r11,32 /* Offset */
stxvl 32+v2,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r5
+#endif
blr
L(prep_tail1):
@@ -208,6 +259,11 @@ L(tail1):
addi r9,r8,1 /* Add null terminator */
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
b L(zero_padding_loop)
@@ -222,6 +278,11 @@ L(tail2):
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,16 /* offset */
stxvl 32+v1,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
b L(zero_padding_loop)
@@ -237,6 +298,11 @@ L(tail3):
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,32 /* offset */
stxvl 32+v2,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
b L(zero_padding_loop)
@@ -252,6 +318,11 @@ L(tail4):
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,48 /* offset */
stxvl 32+v3,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
@@ -274,3 +345,6 @@ L(zero_padding_end):
L(n_tail):
END (FUNC_NAME)
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#endif
@@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
ifneq (,$(filter %le,$(config-machine)))
sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
- rawmemchr-power9 strlen-power9 strncpy-power9
+ rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
@@ -317,6 +317,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c. */
IFUNC_IMPL (i, name, stpncpy,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, stpncpy,
+ hwcap2 & PPC_FEATURE2_ARCH_3_00,
+ __stpncpy_power9)
+#endif
IFUNC_IMPL_ADD (array, i, stpncpy,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__stpncpy_power8)
new file mode 100644
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for POWER9/PPC64.
+ Copyright (C) 2015-2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define STPNCPY __stpncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S>
@@ -26,10 +26,17 @@
extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden;
+# endif
# undef stpncpy
# undef __stpncpy
libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
+# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ ? __stpncpy_power9 :
+# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __stpncpy_power8
: (hwcap & PPC_FEATURE_HAS_VSX)