[v2,2/2] powerpc: Add optimized stpncpy for POWER9
Commit Message
Add stpncpy support into the POWER9 strncpy.
---
sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 +++++++
sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 65 +++++++++++++++++++
sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
.../powerpc64/multiarch/ifunc-impl-list.c | 5 ++
.../powerpc64/multiarch/stpncpy-power9.S | 24 +++++++
sysdeps/powerpc/powerpc64/multiarch/stpncpy.c | 7 ++
6 files changed, 126 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
Comments
Benchtest output:
generic_stpncpy __stpncpy_power9 __stpncpy_power8 __stpncpy_power7 __stpncpy_ppc
Length 16, n 16, alignment 1/ 1: 6.55566 2.5481 2.74063 5.28665 9.96288
Length 16, n 16, alignment 1/ 1: 6.70016 2.54137 2.7108 4.77502 9.91703
Length 16, n 16, alignment 1/ 2: 6.55975 2.56295 2.70641 5.49298
9.59591
Length 16, n 16, alignment 2/ 1: 6.90759 2.52713 2.854 5.48949 9.37664
Length 2, n 4, alignment 7/ 2: 7.90969 2.22698 3.90151 4.6461 8.4503
Length 4, n 2, alignment 2/ 7: 6.14855 1.73403 2.67338 3.05675
6.86316
Length 2, n 4, alignment 7/ 2: 8.40868 2.22338 4.50838 4.51078
9.28489
Length 4, n 2, alignment 2/ 7: 6.14849 1.73402 2.67225 2.85349
6.34342
Length 16, n 16, alignment 2/ 2: 6.963 2.54442 2.87779 5.63547 9.85162
Length 16, n 16, alignment 2/ 2: 6.59452 2.54121 2.84662 5.57178
9.51406
Length 16, n 16, alignment 2/ 4: 6.79115 2.55835 2.84836 5.50427
9.67999
Length 16, n 16, alignment 4/ 2: 6.78419 2.54132 3.54229 5.52563
8.50938
Length 4, n 8, alignment 6/ 4: 8.45703 2.17266 4.80507 3.8714 9.04725
Length 8, n 4, alignment 4/ 6: 6.01753 1.73761 2.8185 2.41527 8.00051
Length 4, n 8, alignment 6/ 4: 7.82081 2.22612 4.80057 3.76103
8.99812
Length 8, n 4, alignment 4/ 6: 6.01752 1.73474 2.82089 2.41524
7.82703
Length 16, n 16, alignment 3/ 3: 6.78194 2.54143 3.21392 5.46447
8.90749
Length 16, n 16, alignment 3/ 3: 6.76324 2.54088 3.22883 5.39689
9.14749
Length 16, n 16, alignment 3/ 6: 7.05278 2.55795 3.22243 5.53422
9.11315
Length 16, n 16, alignment 6/ 3: 6.72881 2.54183 4.58459 5.51658
7.85006
Length 8, n 16, alignment 5/ 6: 7.67184 2.23969 4.13269 4.90728
10.2248
Length 16, n 8, alignment 6/ 5: 5.73672 1.88048 2.6693 4.35579 6.11674
Length 8, n 16, alignment 5/ 6: 7.51707 2.2284 3.67276 4.90637 10.2411
Length 16, n 8, alignment 6/ 5: 5.73665 1.88119 2.57514 3.96351
6.16253
Length 16, n 16, alignment 4/ 4: 7.03577 2.5415 3.66445 4.94157 8.98371
Length 16, n 16, alignment 4/ 4: 6.93549 2.53033 3.65577 5.53815
8.48335
Length 16, n 16, alignment 4/ 0: 6.95106 2.53483 3.48744 5.43759
8.45425
Length 16, n 16, alignment 0/ 4: 6.44601 1.87936 2.41984 5.49488
6.92169
Length 16, n 32, alignment 4/ 0: 9.2036 3.04122 5.78685 6.66434 10.9065
Length 32, n 16, alignment 0/ 4: 6.65504 1.87934 2.41817 6.08706
6.98513
Length 16, n 32, alignment 4/ 0: 9.17461 3.04153 5.77758 6.66444
10.8015
Length 32, n 16, alignment 0/ 4: 6.44123 1.87936 2.41847 5.55207
6.86039
Length 16, n 16, alignment 5/ 5: 6.56005 2.53132 4.22362 5.43527
9.25109
Length 16, n 16, alignment 5/ 5: 6.55552 2.53088 4.22655 5.59271
9.61369
Length 16, n 16, alignment 5/ 2: 6.55553 2.54559 4.31135 5.47438
8.83103
Length 16, n 16, alignment 2/ 5: 6.88992 2.56255 2.84059 5.23185
9.51441
Length 32, n 64, alignment 3/ 2: 12.5054 3.75138 6.42457 10.4719
15.0663
Length 64, n 32, alignment 2/ 3: 9.87185 2.78283 3.17042 7.66624 11.503
Length 32, n 64, alignment 3/ 2: 12.4999 3.74537 6.38161 10.4578
15.1104
Length 64, n 32, alignment 2/ 3: 9.86495 2.77889 3.19171 7.63272
13.9799
Length 16, n 16, alignment 6/ 6: 6.41353 2.5453 4.50915 5.30382 8.45391
Length 16, n 16, alignment 6/ 6: 6.49495 2.54119 4.54493 5.55909 8.1629
Length 16, n 16, alignment 6/ 4: 6.41743 2.54487 4.57202 4.98659
7.53033
Length 16, n 16, alignment 4/ 6: 6.91724 2.54649 3.67868 5.36838
8.45677
Length 64, n 128, alignment 2/ 4: 14.0687 4.93151 8.11667 11.4411
16.9533
Length 128, n 64, alignment 4/ 2: 11.7134 3.58948 4.90121 10.3018
11.6692
Length 64, n 128, alignment 2/ 4: 14.0677 4.93413 7.28129 11.439 22.2186
Length 128, n 64, alignment 4/ 2: 11.7149 3.59312 4.85286 10.3403
19.4651
Length 16, n 16, alignment 7/ 7: 6.76501 2.52563 5.55792 5.44155
8.39997
Length 16, n 16, alignment 7/ 7: 7.16923 2.5265 5.55148 5.60184 7.98311
Length 16, n 16, alignment 7/ 6: 6.76252 2.52629 5.48067 5.51161
7.61026
Length 16, n 16, alignment 6/ 7: 6.65772 2.5521 4.55758 5.48893 7.7301
Length 128, n 256, alignment 1/ 6: 16.2494 7.62034 9.3616 16.2888 19.7029
Length 256, n 128, alignment 6/ 1: 13.4311 4.94455 8.10802 12.2681
15.6941
Length 128, n 256, alignment 1/ 6: 16.2608 7.6209 9.35509 16.2856 38.0277
Length 256, n 128, alignment 6/ 1: 13.4327 4.89474 8.35934 12.2646
34.3268
Length 8, n 16, alignment 0/ 0: 7.20671 2.23256 3.75778 5.63555
7.36414
Length 32, n 16, alignment 0/ 0: 6.4449 1.88 2.41577 2.89598 6.42537
Length 8, n 16, alignment 7/ 2: 7.45976 2.21832 3.91671 4.6524 8.45825
Length 32, n 16, alignment 7/ 2: 6.78267 2.34296 5.59161 5.58598
6.88842
Length 16, n 32, alignment 0/ 0: 9.47971 3.10847 4.74758 4.75377
10.2238
Length 64, n 32, alignment 0/ 0: 8.45634 2.34747 2.59248 2.82356
9.42305
Length 16, n 32, alignment 6/ 4: 9.37784 3.05067 6.92384 9.47727
10.1826
Length 64, n 32, alignment 6/ 4: 9.89233 2.77968 4.63672 7.09838
10.2804
Length 32, n 64, alignment 0/ 0: 11.0813 3.71086 4.43777 5.3549 12.2048
Length 128, n 64, alignment 0/ 0: 9.25192 3.20123 3.53388 4.50794
10.1934
Length 32, n 64, alignment 5/ 6: 12.5099 3.75871 7.29613 9.64902
13.5821
Length 128, n 64, alignment 5/ 6: 11.6115 3.60165 5.71818 9.07288
12.7929
Length 64, n 128, alignment 0/ 0: 12.3671 4.80754 5.46926 6.84492
14.9238
Length 256, n 128, alignment 0/ 0: 8.08427 4.52607 6.47996 5.92086 11.701
Length 64, n 128, alignment 4/ 0: 12.5692 4.89717 7.11058 10.472 15.875
Length 256, n 128, alignment 4/ 0: 12.2945 4.94163 7.11645 12.3831
16.6219
Length 128, n 256, alignment 0/ 0: 13.8948 7.28911 7.78784 9.30215
17.0358
Length 512, n 256, alignment 0/ 0: 10.5266 6.56481 9.14202 9.31096
20.0531
Length 128, n 256, alignment 3/ 2: 16.3534 7.46332 9.90009 18.5282
19.5969
Length 512, n 256, alignment 3/ 2: 17.0519 7.09947 10.1635 23.5411
25.0043
Length 256, n 512, alignment 0/ 0: 15.8935 12.6195 14.0756 14.7553
28.5299
Length 1024, n 512, alignment 0/ 0: 16.3758 10.8028 16.5447 16.8966
37.8653
Length 256, n 512, alignment 2/ 4: 21.16 13.2779 14.3088 26.4475 30.1647
Length 1024, n 512, alignment 2/ 4: 25.3364 12.0899 17.5443 42.7216
47.5803
Length 512, n 1024, alignment 0/ 0: 20.5111 22.9782 19.6648 21.3857
42.4801
Length 2048, n 1024, alignment 0/ 0: 28.4023 19.1577 36.9065 35.4799
68.3555
Length 512, n 1024, alignment 1/ 6: 29.9694 24.3087 22.0513 46.7436
51.5908
Length 2048, n 1024, alignment 1/ 6: 42.9897 21.5402 38.739 78.3266 84.3956
On 9/4/20 1:59 PM, Raphael M Zinsly via Libc-alpha wrote:
> Benchtest output:
> generic_stpncpy __stpncpy_power9 __stpncpy_power8 __stpncpy_power7 __stpncpy_ppc
<snip>
> Length 512, n 1024, alignment 0/ 0: 20.5111 22.9782 19.6648 21.3857 42.4801
<snip>
> Length 512, n 1024, alignment 1/ 6: 29.9694 24.3087 22.0513 46.7436 51.5908
These two seem to be the only cases in which the power9 version loses to
the power8 one. Have you investigated what happens in these two specific
cases?
On 9/4/20 1:56 PM, Raphael Moreira Zinsly via Libc-alpha wrote:
> Add stpncpy support into the POWER9 strncpy.
> ---
> sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 +++++++
> sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 65 +++++++++++++++++++
> sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
> .../powerpc64/multiarch/ifunc-impl-list.c | 5 ++
> .../powerpc64/multiarch/stpncpy-power9.S | 24 +++++++
> sysdeps/powerpc/powerpc64/multiarch/stpncpy.c | 7 ++
> 6 files changed, 126 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
> create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
>
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
> new file mode 100644
> index 0000000000..81d9673d8b
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
> @@ -0,0 +1,24 @@
> +/* Optimized stpncpy implementation for POWER9 LE.
> + Copyright (C) 2020 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define USE_AS_STPNCPY
> +#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
> +
> +weak_alias (__stpncpy, stpncpy)
> +libc_hidden_def (__stpncpy)
> +libc_hidden_builtin_def (stpncpy)
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> index 34fcdee913..f7265b11ec 100644
> --- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -18,16 +18,30 @@
>
> #include <sysdep.h>
>
> +#ifdef USE_AS_STPNCPY
> +# ifndef STPNCPY
> +# define FUNC_NAME __stpncpy
> +# else
> +# define FUNC_NAME STPNCPY
> +# endif
> +#else
> # ifndef STRNCPY
> # define FUNC_NAME strncpy
> # else
> # define FUNC_NAME STRNCPY
> # endif
> +#endif /* !USE_AS_STPNCPY */
>
> /* Implements the function
>
> char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
>
> + or
> +
> + char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> + if USE_AS_STPNCPY is defined.
> +
> The implementation can load bytes past a null terminator, but only
> up to the next 16-byte aligned address, so it never crosses a page. */
>
> @@ -49,7 +63,15 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
>
> /* Empty/1-byte string optimization */
> cmpdi r5,0
> +#ifdef USE_AS_STPNCPY
> + bgt L(cont)
> + /* Compute pointer to last byte copied into dest. */
> + addi r3,r3,1
> + blr
> +L(cont):
> +#else
> beqlr
> +#endif
>
> addi r4,r4,1
> neg r7,r4
> @@ -79,12 +101,20 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
> sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> stxvl 32+v0,r11,r10 /* Partial store */
>
> +#ifdef USE_AS_STPNCPY
> + /* Compute pointer to last byte copied into dest. */
> + add r3,r11,r5
> +#endif
> blr
>
> L(null):
> sldi r10,r8,56 /* stxvl wants size in top 8 bits */
> stxvl 32+v0,r11,r10 /* Partial store */
>
> +#ifdef USE_AS_STPNCPY
> + /* Compute pointer to last byte copied into dest. */
> + add r3,r11,r7
> +#endif
> add r11,r11,r8
> sub r5,r5,r8
> b L(zero_padding_loop)
> @@ -168,6 +198,10 @@ L(n_tail4):
> sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,48 /* Offset */
> stxvl 32+v3,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* Compute pointer to last byte copied into dest. */
> + add r3,r11,r5
> +#endif
> blr
>
> L(prep_n_tail1):
> @@ -179,6 +213,10 @@ L(prep_n_tail1):
> L(n_tail1):
> sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> stxvl 32+v0,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* Compute pointer to last byte copied into dest. */
> + add r3,r11,r5
> +#endif
> blr
>
> L(prep_n_tail2):
> @@ -192,6 +230,10 @@ L(n_tail2):
> sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,16 /* offset */
> stxvl 32+v1,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* Compute pointer to last byte copied into dest. */
> + add r3,r11,r5
> +#endif
> blr
>
> L(prep_n_tail3):
> @@ -206,6 +248,10 @@ L(n_tail3):
> sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,32 /* Offset */
> stxvl 32+v2,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* Compute pointer to last byte copied into dest. */
> + add r3,r11,r5
> +#endif
> blr
>
> L(prep_tail1):
> @@ -215,6 +261,10 @@ L(tail1):
> addi r9,r8,1 /* Add null terminator */
> sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> stxvl 32+v0,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* Compute pointer to last byte copied into dest. */
> + add r3,r11,r8
> +#endif
> add r11,r11,r9
> sub r5,r5,r9
> b L(zero_padding_loop)
> @@ -229,6 +279,10 @@ L(tail2):
> sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,16 /* offset */
> stxvl 32+v1,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* Compute pointer to last byte copied into dest. */
> + add r3,r11,r8
> +#endif
> add r11,r11,r9
> sub r5,r5,r9
> b L(zero_padding_loop)
> @@ -244,6 +298,10 @@ L(tail3):
> sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,32 /* offset */
> stxvl 32+v2,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* Compute pointer to last byte copied into dest. */
> + add r3,r11,r8
> +#endif
> add r11,r11,r9
> sub r5,r5,r9
> b L(zero_padding_loop)
> @@ -259,6 +317,10 @@ L(tail4):
> sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,48 /* offset */
> stxvl 32+v3,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* Compute pointer to last byte copied into dest. */
> + add r3,r11,r8
> +#endif
> add r11,r11,r9
> sub r5,r5,r9
>
> @@ -279,3 +341,6 @@ L(zero_padding_end):
> blr
>
> END (FUNC_NAME)
> +#ifndef USE_AS_STPNCPY
> +libc_hidden_builtin_def (strncpy)
> +#endif
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index cd2b47b403..f46bf50732 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> @@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
>
> ifneq (,$(filter %le,$(config-machine)))
> sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
> - rawmemchr-power9 strlen-power9 strncpy-power9
> + rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9
> endif
> CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
> CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index aa63e1c23f..56790bcfe3 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -317,6 +317,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c. */
> IFUNC_IMPL (i, name, stpncpy,
> +#ifdef __LITTLE_ENDIAN__
> + IFUNC_IMPL_ADD (array, i, stpncpy,
> + hwcap2 & PPC_FEATURE2_ARCH_3_00,
> + __stpncpy_power9)
> +#endif
> IFUNC_IMPL_ADD (array, i, stpncpy,
> hwcap2 & PPC_FEATURE2_ARCH_2_07,
> __stpncpy_power8)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
> new file mode 100644
> index 0000000000..ccbab55c31
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
> @@ -0,0 +1,24 @@
> +/* Optimized stpncpy implementation for POWER9 LE.
> + Copyright (C) 2020 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define STPNCPY __stpncpy_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> index 17df886431..ac17b26650 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> @@ -26,10 +26,17 @@
> extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
> extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
> extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden;
> +# endif
> # undef stpncpy
> # undef __stpncpy
>
> libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
> +# ifdef __LITTLE_ENDIAN__
> + (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> + ? __stpncpy_power9 :
> +# endif
> (hwcap2 & PPC_FEATURE2_ARCH_2_07)
> ? __stpncpy_power8
> : (hwcap & PPC_FEATURE_HAS_VSX)
>
LGTM.
Reviewed-by: Matheus Castanho <msc@linux.ibm.com>
--
Matheus Castanho
Hi Matheus,
On 16/09/2020 09:32, Matheus Castanho wrote:
> On 9/4/20 1:59 PM, Raphael M Zinsly via Libc-alpha wrote:
>> Benchtest output:
>> generic_stpncpy __stpncpy_power9 __stpncpy_power8 __stpncpy_power7 __stpncpy_ppc
> <snip>
>> Length 512, n 1024, alignment 0/ 0: 20.5111 22.9782 19.6648 21.3857 42.4801
> <snip>
>> Length 512, n 1024, alignment 1/ 6: 29.9694 24.3087 22.0513 46.7436 51.5908
>
> These two seem to be the only cases in which the power9 version loses to
> the power8 one. Have you investigated what happens in these two specific
> cases?
>
Yes, the power8 optimization calls memset to do the zero padding at the
end if n > length. In these cases, where n is much larger than the length,
memset is faster than the loop used in my implementation.
Thanks for the review!
Regards,
On Wed, Sep 16, 2020 at 09:56:59AM -0300, Raphael M Zinsly via Libc-alpha wrote:
> On 16/09/2020 09:32, Matheus Castanho wrote:
> > On 9/4/20 1:59 PM, Raphael M Zinsly via Libc-alpha wrote:
> > > Benchtest output:
> > > generic_stpncpy __stpncpy_power9 __stpncpy_power8 __stpncpy_power7 __stpncpy_ppc
> > <snip>
> > > Length 512, n 1024, alignment 0/ 0: 20.5111 22.9782 19.6648 21.3857 42.4801
> > <snip>
> > > Length 512, n 1024, alignment 1/ 6: 29.9694 24.3087 22.0513 46.7436 51.5908
> >
> > These two seem to be the only cases in which the power9 version loses to
> > the power8 one. Have you investigated what happens in these two specific
> > cases?
> >
> Yes the power8 optimization calls memset to do the zero padding at the end
> if n > length. In this case where n is way higher, memset is faster than the
> loop used in my implementation.
Is there some sort of threshold that would help these cases by transitioning
to memset (or replicating the relevant part of that code here)?
PC
new file mode 100644
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for POWER9 LE.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
@@ -18,16 +18,30 @@
#include <sysdep.h>
+#ifdef USE_AS_STPNCPY
+# ifndef STPNCPY
+# define FUNC_NAME __stpncpy
+# else
+# define FUNC_NAME STPNCPY
+# endif
+#else
# ifndef STRNCPY
# define FUNC_NAME strncpy
# else
# define FUNC_NAME STRNCPY
# endif
+#endif /* !USE_AS_STPNCPY */
/* Implements the function
char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+ or
+
+ char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+ if USE_AS_STPNCPY is defined.
+
The implementation can load bytes past a null terminator, but only
up to the next 16-byte aligned address, so it never crosses a page. */
@@ -49,7 +63,15 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
/* Empty/1-byte string optimization */
cmpdi r5,0
+#ifdef USE_AS_STPNCPY
+ bgt L(cont)
+ /* Compute pointer to last byte copied into dest. */
+ addi r3,r3,1
+ blr
+L(cont):
+#else
beqlr
+#endif
addi r4,r4,1
neg r7,r4
@@ -79,12 +101,20 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r5
+#endif
blr
L(null):
sldi r10,r8,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r7
+#endif
add r11,r11,r8
sub r5,r5,r8
b L(zero_padding_loop)
@@ -168,6 +198,10 @@ L(n_tail4):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
addi r11,r11,48 /* Offset */
stxvl 32+v3,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r5
+#endif
blr
L(prep_n_tail1):
@@ -179,6 +213,10 @@ L(prep_n_tail1):
L(n_tail1):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r5
+#endif
blr
L(prep_n_tail2):
@@ -192,6 +230,10 @@ L(n_tail2):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
addi r11,r11,16 /* offset */
stxvl 32+v1,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r5
+#endif
blr
L(prep_n_tail3):
@@ -206,6 +248,10 @@ L(n_tail3):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
addi r11,r11,32 /* Offset */
stxvl 32+v2,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r5
+#endif
blr
L(prep_tail1):
@@ -215,6 +261,10 @@ L(tail1):
addi r9,r8,1 /* Add null terminator */
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
b L(zero_padding_loop)
@@ -229,6 +279,10 @@ L(tail2):
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,16 /* offset */
stxvl 32+v1,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
b L(zero_padding_loop)
@@ -244,6 +298,10 @@ L(tail3):
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,32 /* offset */
stxvl 32+v2,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
b L(zero_padding_loop)
@@ -259,6 +317,10 @@ L(tail4):
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,48 /* offset */
stxvl 32+v3,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
@@ -279,3 +341,6 @@ L(zero_padding_end):
blr
END (FUNC_NAME)
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#endif
@@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
ifneq (,$(filter %le,$(config-machine)))
sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
- rawmemchr-power9 strlen-power9 strncpy-power9
+ rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
@@ -317,6 +317,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c. */
IFUNC_IMPL (i, name, stpncpy,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, stpncpy,
+ hwcap2 & PPC_FEATURE2_ARCH_3_00,
+ __stpncpy_power9)
+#endif
IFUNC_IMPL_ADD (array, i, stpncpy,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__stpncpy_power8)
new file mode 100644
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for POWER9 LE.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define STPNCPY __stpncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S>
@@ -26,10 +26,17 @@
extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden;
+# endif
# undef stpncpy
# undef __stpncpy
libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
+# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ ? __stpncpy_power9 :
+# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __stpncpy_power8
: (hwcap & PPC_FEATURE_HAS_VSX)