[v2,1/2] powerpc: Add optimized strncpy for POWER9
Commit Message
Changes since v1:
- Fixed comments indentation and added some spaces to improve
readability.
- Use "POWER 9 LE" instead of "PowerPC64/POWER9".
- Fixed copyright dates.
- Replaced cmpwi with cmpdi.
---8<---
Similar to the strcpy P9 optimization, this version uses VSX to improve
performance.
---
sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 281 ++++++++++++++++++
sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
.../powerpc64/multiarch/ifunc-impl-list.c | 5 +
.../powerpc64/multiarch/strncpy-power9.S | 26 ++
sysdeps/powerpc/powerpc64/multiarch/strncpy.c | 7 +
5 files changed, 320 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
Comments
Benchtest output:
generic_strncpy __strncpy_power9
__strncpy_power8 __strncpy_power7 __strncpy_ppc
Length 16, n 16, alignment 1/ 1: 6.44861 2.51617 2.54878 5.94753
9.41467
Length 16, n 16, alignment 1/ 1: 6.4448 2.51688 2.56978 5.86275 9.52956
Length 16, n 16, alignment 1/ 2: 6.51392 2.53026 2.55617 5.96487
9.51182
Length 16, n 16, alignment 2/ 1: 6.5421 2.5026 2.82458 5.95353 9.36524
Length 2, n 4, alignment 7/ 2: 8.02857 2.19272 4.35397 4.97347
8.60923
Length 4, n 2, alignment 2/ 7: 6.04262 1.66226 2.31865 3.27123
6.23803
Length 2, n 4, alignment 7/ 2: 8.15691 2.21924 4.48871 4.97328 8.3591
Length 4, n 2, alignment 2/ 7: 6.0428 1.66435 2.31671 3.2874 6.23902
Length 16, n 16, alignment 2/ 2: 6.75511 2.51667 2.82529 5.65252
9.32002
Length 16, n 16, alignment 2/ 2: 6.53469 2.51982 2.82678 5.93257
9.25613
Length 16, n 16, alignment 2/ 4: 6.3502 2.53333 2.82267 5.66948 9.35942
Length 16, n 16, alignment 4/ 2: 6.71533 2.51217 3.47278 5.95821 8.3249
Length 4, n 8, alignment 6/ 4: 7.85332 2.21708 5.68665 4.83111
9.07271
Length 8, n 4, alignment 4/ 6: 5.93863 1.67938 2.67249 3.07391
7.90751
Length 4, n 8, alignment 6/ 4: 8.24352 2.16644 5.22268 5.04674
9.10352
Length 8, n 4, alignment 4/ 6: 5.88514 1.67966 2.67286 3.29382
7.66757
Length 16, n 16, alignment 3/ 3: 6.55525 2.52511 3.06709 5.95625
9.23173
Length 16, n 16, alignment 3/ 3: 6.66344 2.50855 3.11771 5.96121
8.99767
Length 16, n 16, alignment 3/ 6: 6.82163 2.53355 3.0638 5.96451 9.09031
Length 16, n 16, alignment 6/ 3: 6.35636 2.51634 4.17868 5.95112
7.82576
Length 8, n 16, alignment 5/ 6: 7.46873 2.23953 4.33782 5.76124
10.2851
Length 16, n 8, alignment 6/ 5: 5.63643 1.88233 2.32899 4.72233
5.79268
Length 8, n 16, alignment 5/ 6: 7.47291 2.65201 3.9103 5.40334 10.3902
Length 16, n 8, alignment 6/ 5: 5.73738 1.8787 2.32749 4.69061 6.03053
Length 16, n 16, alignment 4/ 4: 6.63998 2.5166 3.5133 5.83764 8.17814
Length 16, n 16, alignment 4/ 4: 6.6866 2.51915 3.5831 5.96121 8.32436
Length 16, n 16, alignment 4/ 0: 6.58543 2.51529 3.38441 5.96909
8.03797
Length 16, n 16, alignment 0/ 4: 6.6541 1.87852 2.45328 5.96068 7.32961
Length 16, n 32, alignment 4/ 0: 9.37236 3.00744 5.92214 7.25884
11.1515
Length 32, n 16, alignment 0/ 4: 6.2795 1.87939 2.45688 5.96206 7.03327
Length 16, n 32, alignment 4/ 0: 9.24513 3.00344 5.97977 6.94778
11.0213
Length 32, n 16, alignment 0/ 4: 6.45422 1.87851 2.45698 5.96172
7.32939
Length 16, n 16, alignment 5/ 5: 6.53949 2.51619 3.88095 5.96091
9.05987
Length 16, n 16, alignment 5/ 5: 6.47371 2.51703 3.91695 5.96417
9.24674
Length 16, n 16, alignment 5/ 2: 6.5493 2.5163 3.78779 5.95898 9.44104
Length 16, n 16, alignment 2/ 5: 6.70967 2.52226 2.82034 5.96365
9.37646
Length 32, n 64, alignment 3/ 2: 14.0298 3.74521 6.80923 11.2825
12.8659
Length 64, n 32, alignment 2/ 3: 9.53123 2.75624 3.21242 8.51653
12.6887
Length 32, n 64, alignment 3/ 2: 14.179 3.83256 6.56898 11.3584 15.2479
Length 64, n 32, alignment 2/ 3: 9.53184 2.75305 3.21245 8.37087
14.1081
Length 16, n 16, alignment 6/ 6: 6.42159 2.51726 4.38574 5.9562 7.12266
Length 16, n 16, alignment 6/ 6: 6.67028 2.51692 4.2448 5.9544 7.81439
Length 16, n 16, alignment 6/ 4: 6.42402 2.51636 4.23817 5.96162
7.23351
Length 16, n 16, alignment 4/ 6: 6.60107 2.53036 3.54038 5.95837
8.32176
Length 64, n 128, alignment 2/ 4: 15.5573 4.80414 7.45917 11.5659
16.9298
Length 128, n 64, alignment 4/ 2: 11.6195 3.53279 4.80585 10.1583
11.6096
Length 64, n 128, alignment 2/ 4: 15.5233 4.7997 7.34679 11.6628 22.0123
Length 128, n 64, alignment 4/ 2: 11.6078 3.5492 4.77929 10.027 19.504
Length 16, n 16, alignment 7/ 7: 6.54515 2.5141 5.04928 5.95083 7.57587
Length 16, n 16, alignment 7/ 7: 7.00425 2.51299 5.06765 5.92888
8.25286
Length 16, n 16, alignment 7/ 6: 6.62954 2.51922 5.07189 6.02372
7.72968
Length 16, n 16, alignment 6/ 7: 6.34475 2.51841 4.36954 5.95968
7.78498
Length 128, n 256, alignment 1/ 6: 17.9386 7.60767 9.40348 16.5301
20.6134
Length 256, n 128, alignment 6/ 1: 13.373 4.84375 7.34616 12.3919 15.1296
Length 128, n 256, alignment 1/ 6: 17.9186 7.6077 9.37853 16.686 39.2821
Length 256, n 128, alignment 6/ 1: 13.3632 4.91799 8.06183 12.4174
34.1655
Length 8, n 16, alignment 0/ 0: 7.36981 2.22579 4.22739 4.9063 7.24636
Length 32, n 16, alignment 0/ 0: 6.43465 1.87932 2.45308 2.41526 7.1679
Length 8, n 16, alignment 7/ 2: 7.48861 2.21639 3.75708 5.35882
8.45777
Length 32, n 16, alignment 7/ 2: 7.03412 2.3535 5.04692 5.95484 7.25068
Length 16, n 32, alignment 0/ 0: 9.10177 3.06646 4.81682 4.41358
9.89656
Length 64, n 32, alignment 0/ 0: 8.57287 2.53847 2.94869 2.70506 8.2629
Length 16, n 32, alignment 6/ 4: 9.20906 3.04216 6.37553 9.46301
10.2489
Length 64, n 32, alignment 6/ 4: 9.73117 2.75023 4.49311 7.7856 9.59261
Length 32, n 64, alignment 0/ 0: 10.9253 3.80104 4.83111 4.97682
12.1086
Length 128, n 64, alignment 0/ 0: 9.26987 3.15895 3.49112 4.31372
10.1329
Length 32, n 64, alignment 5/ 6: 14.1856 3.78089 7.1768 9.63551 13.9944
Length 128, n 64, alignment 5/ 6: 11.5298 3.5249 5.07847 9.96481 12.8245
Length 64, n 128, alignment 0/ 0: 12.0142 4.73085 5.98759 7.1613 15.0462
Length 256, n 128, alignment 0/ 0: 7.96029 4.50244 6.44433 5.38248
11.6022
Length 64, n 128, alignment 4/ 0: 12.4223 4.80085 7.79294 11.0101
15.5277
Length 256, n 128, alignment 4/ 0: 12.2371 4.79242 6.83902 13.2758
16.0479
Length 128, n 256, alignment 0/ 0: 13.9165 7.28703 8.13319 8.79111
16.9101
Length 512, n 256, alignment 0/ 0: 10.5083 6.49881 9.05173 9.03139
19.6212
Length 128, n 256, alignment 3/ 2: 18.025 7.45493 9.86636 18.7234 20.5106
Length 512, n 256, alignment 3/ 2: 16.9588 7.07807 9.97969 23.4911
25.4407
Length 256, n 512, alignment 0/ 0: 17.6801 12.5811 15.3595 13.9989
28.5549
Length 1024, n 512, alignment 0/ 0: 16.379 10.7794 16.4748 16.7344 37.8286
Length 256, n 512, alignment 2/ 4: 23.2012 13.2761 14.3776 26.3752
31.6336
Length 1024, n 512, alignment 2/ 4: 25.4264 12.1716 17.2608 42.2122 47.425
Length 512, n 1024, alignment 0/ 0: 21.0239 23.0736 19.8285 21.0169
48.0091
Length 2048, n 1024, alignment 0/ 0: 28.424 19.323 36.917 35.4247 68.1661
Length 512, n 1024, alignment 1/ 6: 32.3159 24.2617 21.4919 46.5936 55.163
Length 2048, n 1024, alignment 1/ 6: 43.0359 21.6207 37.7643 77.5705
83.2998
On 9/4/20 1:56 PM, Raphael Moreira Zinsly via Libc-alpha wrote:
> Changes since v1:
> - Fixed comments identation and added some spaces to improve
> readbillity.
> - Use "POWER 9 LE" instead of "PowerPC64/POWER9".
> - Fixed copyright dates.
> - Replaced cmpwi for cmpdi.
>
> ---8<---
>
> Similar to the strcpy P9 optimization, this version uses VSX to improve
> performance.
> ---
> sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 281 ++++++++++++++++++
> sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
> .../powerpc64/multiarch/ifunc-impl-list.c | 5 +
> .../powerpc64/multiarch/strncpy-power9.S | 26 ++
> sysdeps/powerpc/powerpc64/multiarch/strncpy.c | 7 +
> 5 files changed, 320 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
>
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> new file mode 100644
> index 0000000000..34fcdee913
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -0,0 +1,281 @@
> +/* Optimized strncpy implementation for POWER9 LE.
> + Copyright (C) 2020 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +# ifndef STRNCPY
> +# define FUNC_NAME strncpy
> +# else
> +# define FUNC_NAME STRNCPY
> +# endif
> +
> +/* Implements the function
> +
> + char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> + The implementation can load bytes past a null terminator, but only
> + up to the next 16-byte aligned address, so it never crosses a page. */
> +
> +.machine power9
> +ENTRY_TOCLESS (FUNC_NAME, 4)
> + CALL_MCOUNT 2
> +
> + /* NULL string optimizations */
> + cmpdi r5, 0
> + beqlr
> +
> + lbz r0,0(r4)
> + stb r0,0(r3)
> + addi r11,r3,1
> + addi r5,r5,-1
> + vspltisb v18,0 /* Zeroes in v18 */
> + cmpdi r0,0
> + beq L(zero_padding_loop)
> +
> + /* Empty/1-byte string optimization */
> + cmpdi r5,0
> + beqlr
> +
> + addi r4,r4,1
> + neg r7,r4
> + rldicl r9,r7,0,60 /* How many bytes to get source 16B aligned? */
> +
> + /* Get source 16B aligned */
> + lvx v0,0,r4
> + lvsr v1,0,r4
> + vperm v0,v18,v0,v1
> +
> + vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
> + vctzlsbb r7,v6 /* Number of trailing zeroes */
> + addi r8,r7,1 /* Add null terminator */
> +
> + /* r8 = bytes including null
> + r9 = bytes to get source 16B aligned
> + if r8 > r9
> + no null, copy r9 bytes
> + else
> + there is a null, copy r8 bytes and return. */
> + cmpld r8,r9
> + bgt L(no_null)
> +
> + cmpld cr6,r8,r5 /* r8 <= n? */
> + ble cr6,L(null)
> +
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
At first I was confused by this 32+vX syntax. Maybe we could consider
adding defines for VSX registers to sysdeps/powerpc/sysdep.h in the
future? This way we could refer to v0+32 as vs32, for example. But I
don't think this needs to be part of this patchset.
> +
> + blr
> +
> +L(null):
> + sldi r10,r8,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> +
> + add r11,r11,r8
> + sub r5,r5,r8
> + b L(zero_padding_loop)
> +
> +L(no_null):
> + cmpld r9,r5 /* Check if length was reached. */
> + bge L(n_tail1)
> +
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> +
> + add r4,r4,r9
> + add r11,r11,r9
> + sub r5,r5,r9
> +
> +L(loop):
> + cmpldi cr6,r5,64 /* Check if length was reached. */
> + ble cr6,L(final_loop)
> +
> + lxv 32+v0,0(r4)
> + vcmpequb. v6,v0,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail1)
> +
> + lxv 32+v1,16(r4)
> + vcmpequb. v6,v1,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail2)
> +
> + lxv 32+v2,32(r4)
> + vcmpequb. v6,v2,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail3)
> +
> + lxv 32+v3,48(r4)
> + vcmpequb. v6,v3,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail4)
> +
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + stxv 32+v3,48(r11)
> +
> + addi r4,r4,64
> + addi r11,r11,64
> + addi r5,r5,-64
> +
> + b L(loop)
> +
> +L(final_loop):
> + cmpldi cr5,r5,16
> + lxv 32+v0,0(r4)
> + vcmpequb. v6,v0,v18 /* Any zero bytes? */
> + ble cr5,L(prep_n_tail1)
> + bne cr6,L(count_tail1)
> + addi r5,r5,-16
> +
> + cmpldi cr5,r5,16
> + lxv 32+v1,16(r4)
> + vcmpequb. v6,v1,v18 /* Any zero bytes? */
> + ble cr5,L(prep_n_tail2)
> + bne cr6,L(count_tail2)
> + addi r5,r5,-16
> +
> + cmpldi cr5,r5,16
> + lxv 32+v2,32(r4)
> + vcmpequb. v6,v2,v18 /* Any zero bytes? */
> + ble cr5,L(prep_n_tail3)
> + bne cr6,L(count_tail3)
> + addi r5,r5,-16
> +
> + lxv 32+v3,48(r4)
> + vcmpequb. v6,v3,v18 /* Any zero bytes? */
> + beq cr6,L(n_tail4)
> +
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpld r8,r5 /* r8 < n? */
> + blt L(tail4)
> +
> +L(n_tail4):
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,48 /* Offset */
> + stxvl 32+v3,r11,r10 /* Partial store */
> + blr
> +
> +L(prep_n_tail1):
> + beq cr6,L(n_tail1) /* Any zero bytes? */
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpld r8,r5 /* r8 < n? */
> + blt L(tail1)
> +
> +L(n_tail1):
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> + blr
> +
> +L(prep_n_tail2):
> + beq cr6,L(n_tail2) /* Any zero bytes? */
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpld r8,r5 /* r8 < n? */
> + blt L(tail2)
> +
> +L(n_tail2):
> + stxv 32+v0,0(r11)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,16 /* offset */
> + stxvl 32+v1,r11,r10 /* Partial store */
> + blr
> +
> +L(prep_n_tail3):
> + beq cr6,L(n_tail3) /* Any zero bytes? */
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpld r8,r5 /* r8 < n? */
> + blt L(tail3)
> +
> +L(n_tail3):
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,32 /* Offset */
> + stxvl 32+v2,r11,r10 /* Partial store */
> + blr
> +
> +L(prep_tail1):
> +L(count_tail1):
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail1):
> + addi r9,r8,1 /* Add null terminator */
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> + b L(zero_padding_loop)
> +
> +L(prep_tail2):
> + addi r5,r5,-16
> +L(count_tail2):
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail2):
> + addi r9,r8,1 /* Add null terminator */
> + stxv 32+v0,0(r11)
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,16 /* offset */
> + stxvl 32+v1,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> + b L(zero_padding_loop)
> +
> +L(prep_tail3):
> + addi r5,r5,-32
> +L(count_tail3):
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail3):
> + addi r9,r8,1 /* Add null terminator */
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,32 /* offset */
> + stxvl 32+v2,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> + b L(zero_padding_loop)
> +
> +L(prep_tail4):
> + addi r5,r5,-48
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail4):
> + addi r9,r8,1 /* Add null terminator */
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,48 /* offset */
> + stxvl 32+v3,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> +
> +/* This code pads the remainder of dest with NULL bytes. */
> +L(zero_padding_loop):
> + cmpldi cr6,r5,16 /* Check if length was reached. */
> + ble cr6,L(zero_padding_end)
> +
> + stxv v18,0(r11)
> + addi r11,r11,16
> + addi r5,r5,-16
> +
> + b L(zero_padding_loop)
> +
> +L(zero_padding_end):
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + stxvl v18,r11,r10 /* Partial store */
> + blr
> +
The logic looks good. I tried to find a way to reuse some code, as there
are many similar blocks (e.g. tail* blocks). But their slight
differences make it hard to reuse anything.
> +END (FUNC_NAME)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index 19acb6c64a..cd2b47b403 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> @@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
>
> ifneq (,$(filter %le,$(config-machine)))
> sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
> - rawmemchr-power9 strlen-power9
> + rawmemchr-power9 strlen-power9 strncpy-power9
> endif
> CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
> CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index ea10b00417..aa63e1c23f 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -301,6 +301,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c. */
> IFUNC_IMPL (i, name, strncpy,
> +#ifdef __LITTLE_ENDIAN__
> + IFUNC_IMPL_ADD (array, i, strncpy,
> + hwcap2 & PPC_FEATURE2_ARCH_3_00,
> + __strncpy_power9)
> +#endif
> IFUNC_IMPL_ADD (array, i, strncpy,
> hwcap2 & PPC_FEATURE2_ARCH_2_07,
> __strncpy_power8)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
> new file mode 100644
> index 0000000000..ab7c570d54
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
> @@ -0,0 +1,26 @@
> +/* Optimized strncpy implementation for POWER9 LE.
> + Copyright (C) 2020 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
> +#define STRNCPY __strncpy_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
> +#endif
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> index 7bacf28aca..8ef0a99cb5 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> @@ -28,11 +28,18 @@
> extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
> extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
> extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
> +# endif
> # undef strncpy
>
> /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> ifunc symbol properly. */
> libc_ifunc_redirected (__redirect_strncpy, strncpy,
> +# ifdef __LITTLE_ENDIAN__
> + (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> + ? __strncpy_power9 :
> +# endif
> (hwcap2 & PPC_FEATURE2_ARCH_2_07)
> ? __strncpy_power8
> : (hwcap & PPC_FEATURE_HAS_VSX)
>
--
The only thing missing now seems to be the .machine power9 issue that
was pointed out in v1.
Otherwise, LGTM.
Reviewed-by: Matheus Castanho <msc@linux.ibm.com>
--
Matheus Castanho
new file mode 100644
@@ -0,0 +1,281 @@
+/* Optimized strncpy implementation for POWER9 LE.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+# ifndef STRNCPY
+# define FUNC_NAME strncpy
+# else
+# define FUNC_NAME STRNCPY
+# endif
+
+/* Implements the function
+
+ char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+ The implementation can load bytes past a null terminator, but only
+ up to the next 16-byte aligned address, so it never crosses a page. */
+
+.machine power9
+ENTRY_TOCLESS (FUNC_NAME, 4)
+ CALL_MCOUNT 2
+
+ /* NULL string optimizations */
+ cmpdi r5, 0
+ beqlr
+
+ lbz r0,0(r4)
+ stb r0,0(r3)
+ addi r11,r3,1
+ addi r5,r5,-1
+ vspltisb v18,0 /* Zeroes in v18 */
+ cmpdi r0,0
+ beq L(zero_padding_loop)
+
+ /* Empty/1-byte string optimization */
+ cmpdi r5,0
+ beqlr
+
+ addi r4,r4,1
+ neg r7,r4
+ rldicl r9,r7,0,60 /* How many bytes to get source 16B aligned? */
+
+ /* Get source 16B aligned */
+ lvx v0,0,r4
+ lvsr v1,0,r4
+ vperm v0,v18,v0,v1
+
+ vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
+ vctzlsbb r7,v6 /* Number of trailing zeroes */
+ addi r8,r7,1 /* Add null terminator */
+
+ /* r8 = bytes including null
+ r9 = bytes to get source 16B aligned
+ if r8 > r9
+ no null, copy r9 bytes
+ else
+ there is a null, copy r8 bytes and return. */
+ cmpld r8,r9
+ bgt L(no_null)
+
+ cmpld cr6,r8,r5 /* r8 <= n? */
+ ble cr6,L(null)
+
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+
+ blr
+
+L(null):
+ sldi r10,r8,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+
+ add r11,r11,r8
+ sub r5,r5,r8
+ b L(zero_padding_loop)
+
+L(no_null):
+ cmpld r9,r5 /* Check if length was reached. */
+ bge L(n_tail1)
+
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+
+ add r4,r4,r9
+ add r11,r11,r9
+ sub r5,r5,r9
+
+L(loop):
+ cmpldi cr6,r5,64 /* Check if length was reached. */
+ ble cr6,L(final_loop)
+
+ lxv 32+v0,0(r4)
+ vcmpequb. v6,v0,v18 /* Any zero bytes? */
+ bne cr6,L(prep_tail1)
+
+ lxv 32+v1,16(r4)
+ vcmpequb. v6,v1,v18 /* Any zero bytes? */
+ bne cr6,L(prep_tail2)
+
+ lxv 32+v2,32(r4)
+ vcmpequb. v6,v2,v18 /* Any zero bytes? */
+ bne cr6,L(prep_tail3)
+
+ lxv 32+v3,48(r4)
+ vcmpequb. v6,v3,v18 /* Any zero bytes? */
+ bne cr6,L(prep_tail4)
+
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ stxv 32+v2,32(r11)
+ stxv 32+v3,48(r11)
+
+ addi r4,r4,64
+ addi r11,r11,64
+ addi r5,r5,-64
+
+ b L(loop)
+
+L(final_loop):
+ cmpldi cr5,r5,16
+ lxv 32+v0,0(r4)
+ vcmpequb. v6,v0,v18 /* Any zero bytes? */
+ ble cr5,L(prep_n_tail1)
+ bne cr6,L(count_tail1)
+ addi r5,r5,-16
+
+ cmpldi cr5,r5,16
+ lxv 32+v1,16(r4)
+ vcmpequb. v6,v1,v18 /* Any zero bytes? */
+ ble cr5,L(prep_n_tail2)
+ bne cr6,L(count_tail2)
+ addi r5,r5,-16
+
+ cmpldi cr5,r5,16
+ lxv 32+v2,32(r4)
+ vcmpequb. v6,v2,v18 /* Any zero bytes? */
+ ble cr5,L(prep_n_tail3)
+ bne cr6,L(count_tail3)
+ addi r5,r5,-16
+
+ lxv 32+v3,48(r4)
+ vcmpequb. v6,v3,v18 /* Any zero bytes? */
+ beq cr6,L(n_tail4)
+
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ cmpld r8,r5 /* r8 < n? */
+ blt L(tail4)
+
+L(n_tail4):
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ stxv 32+v2,32(r11)
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,48 /* Offset */
+ stxvl 32+v3,r11,r10 /* Partial store */
+ blr
+
+L(prep_n_tail1):
+ beq cr6,L(n_tail1) /* Any zero bytes? */
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ cmpld r8,r5 /* r8 < n? */
+ blt L(tail1)
+
+L(n_tail1):
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+ blr
+
+L(prep_n_tail2):
+ beq cr6,L(n_tail2) /* Any zero bytes? */
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ cmpld r8,r5 /* r8 < n? */
+ blt L(tail2)
+
+L(n_tail2):
+ stxv 32+v0,0(r11)
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,16 /* offset */
+ stxvl 32+v1,r11,r10 /* Partial store */
+ blr
+
+L(prep_n_tail3):
+ beq cr6,L(n_tail3) /* Any zero bytes? */
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ cmpld r8,r5 /* r8 < n? */
+ blt L(tail3)
+
+L(n_tail3):
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,32 /* Offset */
+ stxvl 32+v2,r11,r10 /* Partial store */
+ blr
+
+L(prep_tail1):
+L(count_tail1):
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+L(tail1):
+ addi r9,r8,1 /* Add null terminator */
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+ add r11,r11,r9
+ sub r5,r5,r9
+ b L(zero_padding_loop)
+
+L(prep_tail2):
+ addi r5,r5,-16
+L(count_tail2):
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+L(tail2):
+ addi r9,r8,1 /* Add null terminator */
+ stxv 32+v0,0(r11)
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,16 /* offset */
+ stxvl 32+v1,r11,r10 /* Partial store */
+ add r11,r11,r9
+ sub r5,r5,r9
+ b L(zero_padding_loop)
+
+L(prep_tail3):
+ addi r5,r5,-32
+L(count_tail3):
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+L(tail3):
+ addi r9,r8,1 /* Add null terminator */
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,32 /* offset */
+ stxvl 32+v2,r11,r10 /* Partial store */
+ add r11,r11,r9
+ sub r5,r5,r9
+ b L(zero_padding_loop)
+
+L(prep_tail4):
+ addi r5,r5,-48
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+L(tail4):
+ addi r9,r8,1 /* Add null terminator */
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ stxv 32+v2,32(r11)
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,48 /* offset */
+ stxvl 32+v3,r11,r10 /* Partial store */
+ add r11,r11,r9
+ sub r5,r5,r9
+
+/* This code pads the remainder of dest with NULL bytes. */
+L(zero_padding_loop):
+ cmpldi cr6,r5,16 /* Check if length was reached. */
+ ble cr6,L(zero_padding_end)
+
+ stxv v18,0(r11)
+ addi r11,r11,16
+ addi r5,r5,-16
+
+ b L(zero_padding_loop)
+
+L(zero_padding_end):
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ stxvl v18,r11,r10 /* Partial store */
+ blr
+
+END (FUNC_NAME)
@@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
ifneq (,$(filter %le,$(config-machine)))
sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
- rawmemchr-power9 strlen-power9
+ rawmemchr-power9 strlen-power9 strncpy-power9
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
@@ -301,6 +301,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c. */
IFUNC_IMPL (i, name, strncpy,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, strncpy,
+ hwcap2 & PPC_FEATURE2_ARCH_3_00,
+ __strncpy_power9)
+#endif
IFUNC_IMPL_ADD (array, i, strncpy,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__strncpy_power8)
new file mode 100644
@@ -0,0 +1,26 @@
+/* Optimized strncpy implementation for POWER9 LE.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
+#define STRNCPY __strncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+#endif
@@ -28,11 +28,18 @@
extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
+# endif
# undef strncpy
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc_redirected (__redirect_strncpy, strncpy,
+# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ ? __strncpy_power9 :
+# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __strncpy_power8
: (hwcap & PPC_FEATURE_HAS_VSX)