[1/2] powerpc: Optimized strncpy for POWER9
Commit Message
Similar to the strcpy P9 optimization, this version uses VSX to improve
performance.
---
sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 276 ++++++++++++++++++
sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
.../powerpc64/multiarch/ifunc-impl-list.c | 5 +
.../powerpc64/multiarch/strncpy-power9.S | 26 ++
sysdeps/powerpc/powerpc64/multiarch/strncpy.c | 7 +
5 files changed, 315 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
Comments
Here is the make bench output:
generic_strncpy __strncpy_power9
__strncpy_power8 __strncpy_power7 __strncpy_ppc
Length 16, n 16, alignment 1/ 1: 7.11694 2.77348 2.80296 6.5724 10.4471
Length 16, n 16, alignment 1/ 1: 7.1557 2.75968 2.805 6.5748 10.5064
Length 16, n 16, alignment 1/ 2: 7.17956 2.79127 2.79964 6.57323
10.3281
Length 16, n 16, alignment 2/ 1: 7.15841 2.77364 3.10582 6.2332 10.331
Length 2, n 4, alignment 7/ 2: 8.90911 2.4623 5.38449 5.64873 9.36348
Length 4, n 2, alignment 2/ 7: 6.65395 1.84558 2.58298 3.10566
7.46376
Length 2, n 4, alignment 7/ 2: 8.70625 2.41166 5.38131 5.73421
9.64285
Length 4, n 2, alignment 2/ 7: 6.65458 1.84354 2.58382 3.64721
6.96163
Length 16, n 16, alignment 2/ 2: 7.01778 2.77373 3.10668 6.58047
10.6006
Length 16, n 16, alignment 2/ 2: 7.53778 2.75789 3.10591 6.2277 10.2613
Length 16, n 16, alignment 2/ 4: 7.13828 2.79132 3.10567 6.56847 10.619
Length 16, n 16, alignment 4/ 2: 7.38659 2.77668 3.70851 6.54537
9.17368
Length 4, n 8, alignment 6/ 4: 8.71748 2.45183 5.76669 4.65782 10.014
Length 8, n 4, alignment 4/ 6: 6.5504 1.83463 2.96574 2.66227 8.49964
Length 4, n 8, alignment 6/ 4: 8.96461 2.4499 5.78384 5.32287 9.79641
Length 8, n 4, alignment 4/ 6: 6.48083 1.83265 2.9783 3.38632 8.51888
Length 16, n 16, alignment 3/ 3: 7.7538 2.77353 3.29008 6.55912 9.94143
Length 16, n 16, alignment 3/ 3: 7.75279 2.76148 3.30616 6.5445 9.98866
Length 16, n 16, alignment 3/ 6: 7.21486 2.79444 3.33712 6.24747 10.113
Length 16, n 16, alignment 6/ 3: 6.99138 2.77778 4.50777 6.22522
8.53482
Length 8, n 16, alignment 5/ 6: 8.26994 2.77966 4.60681 6.10938
10.5975
Length 16, n 8, alignment 6/ 5: 6.28062 2.07193 2.57761 4.95636
6.48035
Length 8, n 16, alignment 5/ 6: 8.17113 2.43559 4.27753 5.95453
11.1796
Length 16, n 8, alignment 6/ 5: 6.21214 2.07239 2.57714 4.96762
6.76041
Length 16, n 16, alignment 4/ 4: 7.31373 2.77573 3.78349 6.19349
8.91432
Length 16, n 16, alignment 4/ 4: 7.32226 2.75658 3.70319 6.60792
9.17307
Length 16, n 16, alignment 4/ 0: 7.58812 2.76841 3.71554 6.54282
8.90051
Length 16, n 16, alignment 0/ 4: 6.92871 2.06944 2.66876 6.63947
8.08171
Length 16, n 32, alignment 4/ 0: 10.2972 3.3192 6.53695 7.77295 12.332
Length 32, n 16, alignment 0/ 4: 6.98056 2.06954 2.66445 6.54976
7.65286
Length 16, n 32, alignment 4/ 0: 10.5356 3.31343 6.53813 7.72029
12.2915
Length 32, n 16, alignment 0/ 4: 7.36068 2.06945 2.66424 6.21052
8.07614
Length 16, n 16, alignment 5/ 5: 7.2122 2.77732 4.17451 6.55383 10.4887
Length 16, n 16, alignment 5/ 5: 7.34438 2.77512 4.17191 6.56873
10.5664
Length 16, n 16, alignment 5/ 2: 7.15746 2.76198 4.14481 6.56235
10.7391
Length 16, n 16, alignment 2/ 5: 7.19372 2.79273 3.10693 6.56984
10.2697
Length 32, n 64, alignment 3/ 2: 15.3918 4.22964 7.0146 12.5809 13.8661
Length 64, n 32, alignment 2/ 3: 10.5331 3.02942 3.54253 9.19106
12.9356
Length 32, n 64, alignment 3/ 2: 15.369 4.17282 7.36163 12.5759 16.8501
Length 64, n 32, alignment 2/ 3: 10.5585 3.01971 3.52885 9.03369
15.6663
Length 16, n 16, alignment 6/ 6: 7.0405 2.77527 4.53842 6.54733 7.99437
Length 16, n 16, alignment 6/ 6: 7.02801 2.76059 4.52873 6.53536
8.45713
Length 16, n 16, alignment 6/ 4: 7.42011 2.77669 4.52223 6.57756 7.9899
Length 16, n 16, alignment 4/ 6: 7.37787 2.77507 3.77821 6.57058
9.17396
Length 64, n 128, alignment 2/ 4: 17.188 5.33493 8.00394 12.6196 19.1784
Length 128, n 64, alignment 4/ 2: 12.7962 3.91004 5.42994 11.294 12.5273
Length 64, n 128, alignment 2/ 4: 17.2298 5.2748 8.15392 12.6039 24.3802
Length 128, n 64, alignment 4/ 2: 12.7866 3.87534 5.3334 11.8516 21.6528
Length 16, n 16, alignment 7/ 7: 7.75015 2.76775 5.59024 6.57976
8.42318
Length 16, n 16, alignment 7/ 7: 7.81681 2.75691 5.56801 6.55397
10.0378
Length 16, n 16, alignment 7/ 6: 7.75225 2.77446 5.56813 6.57349
8.49645
Length 16, n 16, alignment 6/ 7: 7.23237 2.79186 4.51528 6.55304
8.63443
Length 128, n 256, alignment 1/ 6: 19.8414 8.37691 10.3445 18.4838
22.8314
Length 256, n 128, alignment 6/ 1: 14.7972 5.38498 8.83611 13.8521
16.6154
Length 128, n 256, alignment 1/ 6: 19.8497 8.37754 10.3469 18.2655
43.3568
Length 256, n 128, alignment 6/ 1: 14.7542 5.31075 8.75314 13.7759
37.6351
Length 8, n 16, alignment 0/ 0: 8.19872 2.45818 4.27602 4.6578 7.98513
Length 32, n 16, alignment 0/ 0: 6.92066 2.07115 2.66465 2.66381
7.75655
Length 8, n 16, alignment 7/ 2: 8.18253 2.42685 4.70317 6.01808
9.35743
Length 32, n 16, alignment 7/ 2: 7.79714 2.60074 5.58717 6.64181
6.98583
Length 16, n 32, alignment 0/ 0: 10.4715 3.40184 6.28388 4.86146
11.0819
Length 64, n 32, alignment 0/ 0: 10.4403 2.54135 3.07109 3.38791
9.35196
Length 16, n 32, alignment 6/ 4: 10.7077 3.34867 7.01321 10.4278
11.2951
Length 64, n 32, alignment 6/ 4: 10.9215 3.03041 5.04324 8.30023
11.2648
Length 32, n 64, alignment 0/ 0: 12.0062 4.09428 5.32372 5.48319
14.1455
Length 128, n 64, alignment 0/ 0: 10.1803 3.47282 3.83134 4.21557
10.6674
Length 32, n 64, alignment 5/ 6: 15.4165 4.16297 7.78876 10.8762
15.4308
Length 128, n 64, alignment 5/ 6: 12.7332 3.91667 5.8014 10.5869 14.0961
Length 64, n 128, alignment 0/ 0: 13.238 5.24242 6.90661 8.05566 15.9848
Length 256, n 128, alignment 0/ 0: 8.759 4.9483 6.98675 6.11489 12.6755
Length 64, n 128, alignment 4/ 0: 13.6593 5.27931 8.60925 12.5916
17.5016
Length 256, n 128, alignment 4/ 0: 13.4801 5.37114 7.47485 14.0585
17.4517
Length 128, n 256, alignment 0/ 0: 15.3147 8.02462 8.92006 9.67769
20.3757
Length 512, n 256, alignment 0/ 0: 11.5638 7.22535 9.80468 9.93597
21.3421
Length 128, n 256, alignment 3/ 2: 19.8948 8.15967 10.9435 20.6146
22.4146
Length 512, n 256, alignment 3/ 2: 18.681 7.77864 10.9269 25.9269 28.0105
Length 256, n 512, alignment 0/ 0: 19.4894 13.7363 14.8394 15.4064
31.6341
Length 1024, n 512, alignment 0/ 0: 18.0108 11.8737 18.1779 18.5072
41.5425
Length 256, n 512, alignment 2/ 4: 25.5662 14.5189 16.1872 29.5395
33.7587
Length 1024, n 512, alignment 2/ 4: 28.0079 13.2347 19.067 48.1998 52.3078
Length 512, n 1024, alignment 0/ 0: 23.1385 25.4237 21.2303 23.632 47.4502
Length 2048, n 1024, alignment 0/ 0: 31.201 21.308 40.6351 39.04 75.0329
Length 512, n 1024, alignment 1/ 6: 35.6234 27.0042 24.4711 51.3364
60.6277
Length 2048, n 1024, alignment 1/ 6: 47.442 24.0381 41.6616 85.4832 91.8897
On 8/20/20 1:29 PM, Raphael Moreira Zinsly via Libc-alpha wrote:
> Similar to the strcpy P9 optimization, this version uses VSX to improve
> performance.
> ---
> sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 276 ++++++++++++++++++
> sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
> .../powerpc64/multiarch/ifunc-impl-list.c | 5 +
> .../powerpc64/multiarch/strncpy-power9.S | 26 ++
> sysdeps/powerpc/powerpc64/multiarch/strncpy.c | 7 +
> 5 files changed, 315 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
>
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> new file mode 100644
> index 0000000000..cde68384d4
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -0,0 +1,276 @@
> +/* Optimized strncpy implementation for PowerPC64/POWER9.
> + Copyright (C) 2020 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +# ifndef STRNCPY
> +# define FUNC_NAME strncpy
> +# else
> +# define FUNC_NAME STRNCPY
> +# endif
> +
> +/* Implements the function
> +
> + char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> + The implementation can load bytes past a null terminator, but only
> + up to the next 16B boundary, so it never crosses a page. */
> +
> +.machine power9
> +ENTRY_TOCLESS (FUNC_NAME, 4)
> + CALL_MCOUNT 2
> +
> + cmpwi r5, 0
> + beqlr
Trivial nit, a newline after branches helps readability for me.
> + /* NULL string optimisation */
> + lbz r0,0(r4)
> + stb r0,0(r3)
> + addi r11,r3,1
> + addi r5,r5,-1
> + vspltisb v18,0 /* Zeroes in v18 */
> + cmpwi r0,0
> + beq L(zero_padding_loop)
> +
> + cmpwi r5,0
> + beqlr
OK.
> +
> +L(cont):
I think this label can be removed or replaced with a comment.
> + addi r4,r4,1
> + neg r7,r4
> + rldicl r9,r7,0,60 /* How many bytes to get source 16B aligned? */
> +
> + /* Get source 16B aligned */
> + lvx v0,0,r4
> + lvsr v1,0,r4
> + vperm v0,v18,v0,v1
> +
> + vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
> + vctzlsbb r7,v6 /* Number of trailing zeroes */
> + addi r8,r7,1 /* Add null terminator */
Minor nit, can you align the comment with previous comments?
> +
> + /* r8 = bytes including null
> + r9 = bytes to get source 16B aligned
> + if r8 > r9
> + no null, copy r9 bytes
> + else
> + there is a null, copy r8 bytes and return. */
> + cmpd r8,r9
> + bgt L(no_null)
> +
> + cmpd r8,r5 /* r8 <= n? */
Minor, you could use another CR and run this in parallel with the
previous check.
> + ble L(null)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> +
> + blr
OK.
> +
> +L(null):
> + sldi r10,r8,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> +
> + add r11,r11,r8
> + sub r5,r5,r8
> + b L(zero_padding_loop)
OK.
> +
> +L(no_null):
> + cmpd r9,r5 /* Check if length was reached. */
> + bge L(n_tail1)
An extra newline would help here.
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> +
> + add r4,r4,r9
> + add r11,r11,r9
> + sub r5,r5,r9
OK.
> +
> +L(loop):
> + cmpldi cr6,r5,64 /* Check if length was reached. */
> + ble cr6,L(final_loop)
> +
> + lxv 32+v0,0(r4)
> + vcmpequb. v6,v0,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail1)
> +
> + lxv 32+v1,16(r4)
> + vcmpequb. v6,v1,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail2)
> +
> + lxv 32+v2,32(r4)
> + vcmpequb. v6,v2,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail3)
> +
> + lxv 32+v3,48(r4)
> + vcmpequb. v6,v3,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail4)
> +
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + stxv 32+v3,48(r11)
> +
> + addi r4,r4,64
> + addi r11,r11,64
> + addi r5,r5,-64
> +
> + b L(loop)
OK.
> +
> +L(final_loop):
> + cmpldi cr5,r5,16
> + lxv 32+v0,0(r4)
> + vcmpequb. v6,v0,v18 /* Any zero bytes? */
> + ble cr5,L(prep_n_tail1)
> + bne cr6,L(count_tail1)
> + addi r5,r5,-16
> +
> + cmpldi cr5,r5,16
> + lxv 32+v1,16(r4)
> + vcmpequb. v6,v1,v18 /* Any zero bytes? */
> + ble cr5,L(prep_n_tail2)
> + bne cr6,L(count_tail2)
> + addi r5,r5,-16
> +
> + cmpldi cr5,r5,16
> + lxv 32+v2,32(r4)
> + vcmpequb. v6,v2,v18 /* Any zero bytes? */
> + ble cr5,L(prep_n_tail3)
> + bne cr6,L(count_tail3)
> + addi r5,r5,-16
OK.
> +
> + lxv 32+v3,48(r4)
> + vcmpequb. v6,v3,v18 /* Any zero bytes? */
> + beq cr6,L(n_tail4)
> +
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpd r8,r5 /* r8 < n? */
> + blt L(tail4)
OK. Newline here (and for the other similar cases below too please).
> +L(n_tail4):
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,48 /* Offset */
> + stxvl 32+v3,r11,r10 /* Partial store */
> + blr
OK.
> +
> +L(prep_n_tail1):
> + beq cr6,L(n_tail1) /* Any zero bytes? */
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpd r8,r5 /* r8 < n? */
> + blt L(tail1)
> +L(n_tail1):
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> + blr
OK.
> +
> +L(prep_n_tail2):
> + beq cr6,L(n_tail2) /* Any zero bytes? */
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpd r8,r5 /* r8 < n? */
> + blt L(tail2)
> +L(n_tail2):
> + stxv 32+v0,0(r11)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,16 /* offset */
> + stxvl 32+v1,r11,r10 /* Partial store */
> + blr
OK.
> +
> +L(prep_n_tail3):
> + beq cr6,L(n_tail3) /* Any zero bytes? */
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpd r8,r5 /* r8 < n? */
> + blt L(tail3)
> +L(n_tail3):
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,32 /* Offset */
> + stxvl 32+v2,r11,r10 /* Partial store */
> + blr
OK.
> +
> +L(prep_tail1):
> +L(count_tail1):
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail1):
> + addi r9,r8,1 /* Add null terminator */
Please align this comment (and the 3 other similar cases).
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> + b L(zero_padding_loop)
OK.
> +
> +L(prep_tail2):
> + addi r5,r5,-16
> +L(count_tail2):
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail2):
> + addi r9,r8,1 /* Add null terminator */
> + stxv 32+v0,0(r11)
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,16 /* offset */
> + stxvl 32+v1,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> + b L(zero_padding_loop)
> +
> +L(prep_tail3):
> + addi r5,r5,-32
> +L(count_tail3):
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail3):
> + addi r9,r8,1 /* Add null terminator */
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,32 /* offset */
> + stxvl 32+v2,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> + b L(zero_padding_loop)
> +
> +L(prep_tail4):
> + addi r5,r5,-48
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail4):
> + addi r9,r8,1 /* Add null terminator */
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,48 /* offset */
> + stxvl 32+v3,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
OK.
> +
> +/* This code pads the remainder of dest with NULL bytes. */
> +L(zero_padding_loop):
> + cmpldi cr6,r5,16 /* Check if length was reached. */
> + ble cr6,L(zero_padding_end)
> +
> + stxv v18,0(r11)
> + addi r11,r11,16
> + addi r5,r5,-16
> +
> + b L(zero_padding_loop)
> +
OK.
> +L(zero_padding_end):
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + stxvl v18,r11,r10 /* Partial store */
> + blr
OK.
> +
> +L(n_tail):
Is this label used?
> +
> +END (FUNC_NAME)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index 19acb6c64a..cd2b47b403 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
OK.
> index ea10b00417..aa63e1c23f 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
OK.
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
> new file mode 100644
> index 0000000000..b9b6092f7b
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
OK.
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> index 7bacf28aca..822ceb2003 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> @@ -28,11 +28,18 @@
> extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
> extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
> extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
> +# endif
> # undef strncpy
>
> /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> ifunc symbol properly. */
> libc_ifunc_redirected (__redirect_strncpy, strncpy,
> +# ifdef __LITTLE_ENDIAN__
> + (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> + ? __strncpy_power9 :
Trivial nit, I think the above two lines need two extra spaces.
On Thu, Aug 20, 2020 at 03:29:16PM -0300, Raphael Moreira Zinsly via Libc-alpha wrote:
> Similar to the strcpy P9 optimization, this version uses VSX to improve
> performance.
> ---
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> new file mode 100644
> index 0000000000..cde68384d4
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -0,0 +1,276 @@
> +/* Optimized strncpy implementation for PowerPC64/POWER9.
sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S below, has
"POWER9/PPC64". Can we make these consistent? Can we just say
"POWER9"? Do we need to indicate little-endian only?
> + Copyright (C) 2020 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +# ifndef STRNCPY
> +# define FUNC_NAME strncpy
> +# else
> +# define FUNC_NAME STRNCPY
> +# endif
> +
> +/* Implements the function
> +
> + char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> + The implementation can load bytes past a null terminator, but only
> + up to the next 16B boundary, so it never crosses a page. */
nit, subjective: "up to the next 16-byte aligned address"
> +
> +.machine power9
> +ENTRY_TOCLESS (FUNC_NAME, 4)
> + CALL_MCOUNT 2
> +
> + cmpwi r5, 0
This should be "cmpdi".
> + beqlr
> + /* NULL string optimisation */
This comment would make more sense above the "cmpdi", above.
> + lbz r0,0(r4)
> + stb r0,0(r3)
> + addi r11,r3,1
> + addi r5,r5,-1
> + vspltisb v18,0 /* Zeroes in v18 */
> + cmpwi r0,0
This should be "cmpdi".
> + beq L(zero_padding_loop)
> +
Given the above "NULL string" comment, you could
put an "empty string optimization" comment here.
> + cmpwi r5,0
This should be "cmpdi".
> + beqlr
The "addi r11,r3,1" and "vspltisb v18,0" above aren't needed until
a bit later, which penalizes the empty string case. I think you
can move the empty string test up. Some experiments seemed to move
the lbz and dependent stb apart. Something like this:
/* NULL string optimisation */
cmpdi r5,0
beqlr
lbz r0,0(r4)
/* empty/1-byte string optimisation */
cmpdi r5,1
stb r0,0(r3)
beqlr
cmpdi r0,0
addi r11,r3,1
addi r5,r5,-1
vspltisb v18,0 /* Zeroes in v18 */
beq L(zero_padding_loop)
(But, I didn't see significant performance difference in
some light experimentation. It might be worth another look.)
> +
> +L(cont):
This label isn't used.
> + addi r4,r4,1
> + neg r7,r4
> + rldicl r9,r7,0,60 /* How many bytes to get source 16B aligned? */
> +
> + /* Get source 16B aligned */
> + lvx v0,0,r4
> + lvsr v1,0,r4
> + vperm v0,v18,v0,v1
> +
> + vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
> + vctzlsbb r7,v6 /* Number of trailing zeroes */
> + addi r8,r7,1 /* Add null terminator */
> +
> + /* r8 = bytes including null
> + r9 = bytes to get source 16B aligned
> + if r8 > r9
> + no null, copy r9 bytes
> + else
> + there is a null, copy r8 bytes and return. */
> + cmpd r8,r9
This should probably be "cmpld".
> + bgt L(no_null)
> +
> + cmpd r8,r5 /* r8 <= n? */
This should probably be "cmpld".
> + ble L(null)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
Do we still need this "32+v0" syntax? Is that due to a minimum supported
level of binutils which isn't VSX-aware?
> +
> + blr
> +
> +L(null):
> + sldi r10,r8,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> +
> + add r11,r11,r8
> + sub r5,r5,r8
> + b L(zero_padding_loop)
> +
> +L(no_null):
> + cmpd r9,r5 /* Check if length was reached. */
This should probably be "cmpld".
> + bge L(n_tail1)
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> +
> + add r4,r4,r9
> + add r11,r11,r9
> + sub r5,r5,r9
> +
> +L(loop):
> + cmpldi cr6,r5,64 /* Check if length was reached. */
> + ble cr6,L(final_loop)
> +
> + lxv 32+v0,0(r4)
> + vcmpequb. v6,v0,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail1)
> +
> + lxv 32+v1,16(r4)
> + vcmpequb. v6,v1,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail2)
> +
> + lxv 32+v2,32(r4)
> + vcmpequb. v6,v2,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail3)
> +
> + lxv 32+v3,48(r4)
> + vcmpequb. v6,v3,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail4)
> +
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + stxv 32+v3,48(r11)
> +
> + addi r4,r4,64
> + addi r11,r11,64
> + addi r5,r5,-64
> +
> + b L(loop)
> +
> +L(final_loop):
> + cmpldi cr5,r5,16
> + lxv 32+v0,0(r4)
> + vcmpequb. v6,v0,v18 /* Any zero bytes? */
> + ble cr5,L(prep_n_tail1)
> + bne cr6,L(count_tail1)
> + addi r5,r5,-16
> +
> + cmpldi cr5,r5,16
> + lxv 32+v1,16(r4)
> + vcmpequb. v6,v1,v18 /* Any zero bytes? */
> + ble cr5,L(prep_n_tail2)
> + bne cr6,L(count_tail2)
> + addi r5,r5,-16
> +
> + cmpldi cr5,r5,16
> + lxv 32+v2,32(r4)
> + vcmpequb. v6,v2,v18 /* Any zero bytes? */
> + ble cr5,L(prep_n_tail3)
> + bne cr6,L(count_tail3)
> + addi r5,r5,-16
> +
> + lxv 32+v3,48(r4)
> + vcmpequb. v6,v3,v18 /* Any zero bytes? */
> + beq cr6,L(n_tail4)
> +
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpd r8,r5 /* r8 < n? */
This should probably be "cmpld".
> + blt L(tail4)
> +L(n_tail4):
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,48 /* Offset */
> + stxvl 32+v3,r11,r10 /* Partial store */
> + blr
> +
> +L(prep_n_tail1):
> + beq cr6,L(n_tail1) /* Any zero bytes? */
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpd r8,r5 /* r8 < n? */
This should probably be "cmpld".
> + blt L(tail1)
> +L(n_tail1):
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> + blr
> +
> +L(prep_n_tail2):
> + beq cr6,L(n_tail2) /* Any zero bytes? */
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpd r8,r5 /* r8 < n? */
This should probably be "cmpld".
> + blt L(tail2)
> +L(n_tail2):
> + stxv 32+v0,0(r11)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,16 /* offset */
> + stxvl 32+v1,r11,r10 /* Partial store */
> + blr
> +
> +L(prep_n_tail3):
> + beq cr6,L(n_tail3) /* Any zero bytes? */
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpd r8,r5 /* r8 < n? */
This should probably be "cmpld".
> + blt L(tail3)
> +L(n_tail3):
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,32 /* Offset */
> + stxvl 32+v2,r11,r10 /* Partial store */
> + blr
> +
> +L(prep_tail1):
> +L(count_tail1):
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail1):
> + addi r9,r8,1 /* Add null terminator */
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> + b L(zero_padding_loop)
> +
> +L(prep_tail2):
> + addi r5,r5,-16
> +L(count_tail2):
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail2):
> + addi r9,r8,1 /* Add null terminator */
> + stxv 32+v0,0(r11)
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,16 /* offset */
> + stxvl 32+v1,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> + b L(zero_padding_loop)
> +
> +L(prep_tail3):
> + addi r5,r5,-32
> +L(count_tail3):
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail3):
> + addi r9,r8,1 /* Add null terminator */
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,32 /* offset */
> + stxvl 32+v2,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> + b L(zero_padding_loop)
> +
> +L(prep_tail4):
> + addi r5,r5,-48
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail4):
> + addi r9,r8,1 /* Add null terminator */
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,48 /* offset */
> + stxvl 32+v3,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> +
> +/* This code pads the remainder of dest with NULL bytes. */
> +L(zero_padding_loop):
> + cmpldi cr6,r5,16 /* Check if length was reached. */
> + ble cr6,L(zero_padding_end)
> +
> + stxv v18,0(r11)
> + addi r11,r11,16
> + addi r5,r5,-16
> +
> + b L(zero_padding_loop)
> +
> +L(zero_padding_end):
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + stxvl v18,r11,r10 /* Partial store */
> + blr
> +
> +L(n_tail):
> +
> +END (FUNC_NAME)
PC
Raphael Moreira Zinsly via Libc-alpha <libc-alpha@sourceware.org> writes:
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> new file mode 100644
> index 0000000000..cde68384d4
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -0,0 +1,276 @@
> ...
> +/* Implements the function
> +
> + char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> + The implementation can load bytes past a null terminator, but only
> + up to the next 16B boundary, so it never crosses a page. */
> +
> +.machine power9
I don't think Binutils 2.26 supports .machine power9. Likewise for all P9
instructions. However, current glibc is expected to work with Binutils 2.26
(ppc64le), i.e. builds with Binutils 2.26 should not fail.
So, we either need to change this code (e.g. similar to strcmp) or we need
to bump the Binutils requirements.
The last time Binutils requirements was bumped was in 2017, so I think it's safe
to do this now.
Let me prepare a patch proposing this.
On 9/2/20 8:20 AM, Tulio Magno Quites Machado Filho wrote:
> Raphael Moreira Zinsly via Libc-alpha <libc-alpha@sourceware.org> writes:
>
>> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
>> new file mode 100644
>> index 0000000000..cde68384d4
>> --- /dev/null
>> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
>> @@ -0,0 +1,276 @@
>> ...
>> +/* Implements the function
>> +
>> + char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
>> +
>> + The implementation can load bytes past a null terminator, but only
>> + up to the next 16B boundary, so it never crosses a page. */
>> +
>> +.machine power9
>
> I don't think Binutils 2.26 supports .machine power9. Likewise for all P9
> instructions. However, current glibc is expected to work with Binutils 2.26
> (ppc64le), i.e. builds with Binutils 2.26 should not fail.
>
> So, we either need to change this code (e.g. similar to strcmp) or we need
> to bump the Binutils requirements.
> The last time Binutils requirements was bumped was in 2017, so I think it's safe
> to do this now.
>
> Let me prepare a patch proposing this.
There are at least 5 uses of .machine power9 throughout glibc today. I
agree with bumping at least the ppc64le requirements to match.
new file mode 100644
@@ -0,0 +1,276 @@
+/* Optimized strncpy implementation for PowerPC64/POWER9.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+# ifndef STRNCPY
+# define FUNC_NAME strncpy
+# else
+# define FUNC_NAME STRNCPY
+# endif
+
+/* Implements the function
+
+ char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+ The implementation can load bytes past a null terminator, but only
+ up to the next 16B boundary, so it never crosses a page. */
+
+.machine power9
+ENTRY_TOCLESS (FUNC_NAME, 4)
+ CALL_MCOUNT 2
+
+ cmpwi r5, 0
+ beqlr
+ /* NULL string optimisation */
+ lbz r0,0(r4)
+ stb r0,0(r3)
+ addi r11,r3,1
+ addi r5,r5,-1
+ vspltisb v18,0 /* Zeroes in v18 */
+ cmpwi r0,0
+ beq L(zero_padding_loop)
+
+ cmpwi r5,0
+ beqlr
+
+L(cont):
+ addi r4,r4,1
+ neg r7,r4
+ rldicl r9,r7,0,60 /* How many bytes to get source 16B aligned? */
+
+ /* Get source 16B aligned */
+ lvx v0,0,r4
+ lvsr v1,0,r4
+ vperm v0,v18,v0,v1
+
+ vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
+ vctzlsbb r7,v6 /* Number of trailing zeroes */
+ addi r8,r7,1 /* Add null terminator */
+
+ /* r8 = bytes including null
+ r9 = bytes to get source 16B aligned
+ if r8 > r9
+ no null, copy r9 bytes
+ else
+ there is a null, copy r8 bytes and return. */
+ cmpd r8,r9
+ bgt L(no_null)
+
+ cmpd r8,r5 /* r8 <= n? */
+ ble L(null)
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+
+ blr
+
+L(null):
+ sldi r10,r8,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+
+ add r11,r11,r8
+ sub r5,r5,r8
+ b L(zero_padding_loop)
+
+L(no_null):
+ cmpd r9,r5 /* Check if length was reached. */
+ bge L(n_tail1)
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+
+ add r4,r4,r9
+ add r11,r11,r9
+ sub r5,r5,r9
+
+L(loop):
+ cmpldi cr6,r5,64 /* Check if length was reached. */
+ ble cr6,L(final_loop)
+
+ lxv 32+v0,0(r4)
+ vcmpequb. v6,v0,v18 /* Any zero bytes? */
+ bne cr6,L(prep_tail1)
+
+ lxv 32+v1,16(r4)
+ vcmpequb. v6,v1,v18 /* Any zero bytes? */
+ bne cr6,L(prep_tail2)
+
+ lxv 32+v2,32(r4)
+ vcmpequb. v6,v2,v18 /* Any zero bytes? */
+ bne cr6,L(prep_tail3)
+
+ lxv 32+v3,48(r4)
+ vcmpequb. v6,v3,v18 /* Any zero bytes? */
+ bne cr6,L(prep_tail4)
+
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ stxv 32+v2,32(r11)
+ stxv 32+v3,48(r11)
+
+ addi r4,r4,64
+ addi r11,r11,64
+ addi r5,r5,-64
+
+ b L(loop)
+
+L(final_loop):
+ cmpldi cr5,r5,16
+ lxv 32+v0,0(r4)
+ vcmpequb. v6,v0,v18 /* Any zero bytes? */
+ ble cr5,L(prep_n_tail1)
+ bne cr6,L(count_tail1)
+ addi r5,r5,-16
+
+ cmpldi cr5,r5,16
+ lxv 32+v1,16(r4)
+ vcmpequb. v6,v1,v18 /* Any zero bytes? */
+ ble cr5,L(prep_n_tail2)
+ bne cr6,L(count_tail2)
+ addi r5,r5,-16
+
+ cmpldi cr5,r5,16
+ lxv 32+v2,32(r4)
+ vcmpequb. v6,v2,v18 /* Any zero bytes? */
+ ble cr5,L(prep_n_tail3)
+ bne cr6,L(count_tail3)
+ addi r5,r5,-16
+
+ lxv 32+v3,48(r4)
+ vcmpequb. v6,v3,v18 /* Any zero bytes? */
+ beq cr6,L(n_tail4)
+
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ cmpd r8,r5 /* r8 < n? */
+ blt L(tail4)
+L(n_tail4):
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ stxv 32+v2,32(r11)
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,48 /* Offset */
+ stxvl 32+v3,r11,r10 /* Partial store */
+ blr
+
+L(prep_n_tail1):
+ beq cr6,L(n_tail1) /* Any zero bytes? */
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ cmpd r8,r5 /* r8 < n? */
+ blt L(tail1)
+L(n_tail1):
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+ blr
+
+L(prep_n_tail2):
+ beq cr6,L(n_tail2) /* Any zero bytes? */
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ cmpd r8,r5 /* r8 < n? */
+ blt L(tail2)
+L(n_tail2):
+ stxv 32+v0,0(r11)
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,16 /* offset */
+ stxvl 32+v1,r11,r10 /* Partial store */
+ blr
+
+L(prep_n_tail3):
+ beq cr6,L(n_tail3) /* Any zero bytes? */
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ cmpd r8,r5 /* r8 < n? */
+ blt L(tail3)
+L(n_tail3):
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,32 /* Offset */
+ stxvl 32+v2,r11,r10 /* Partial store */
+ blr
+
+L(prep_tail1):
+L(count_tail1):
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+L(tail1):
+ addi r9,r8,1 /* Add null terminator */
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+ add r11,r11,r9
+ sub r5,r5,r9
+ b L(zero_padding_loop)
+
+L(prep_tail2):
+ addi r5,r5,-16
+L(count_tail2):
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+L(tail2):
+ addi r9,r8,1 /* Add null terminator */
+ stxv 32+v0,0(r11)
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,16 /* offset */
+ stxvl 32+v1,r11,r10 /* Partial store */
+ add r11,r11,r9
+ sub r5,r5,r9
+ b L(zero_padding_loop)
+
+L(prep_tail3):
+ addi r5,r5,-32
+L(count_tail3):
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+L(tail3):
+ addi r9,r8,1 /* Add null terminator */
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,32 /* offset */
+ stxvl 32+v2,r11,r10 /* Partial store */
+ add r11,r11,r9
+ sub r5,r5,r9
+ b L(zero_padding_loop)
+
+L(prep_tail4):
+ addi r5,r5,-48
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+L(tail4):
+ addi r9,r8,1 /* Add null terminator */
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ stxv 32+v2,32(r11)
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,48 /* offset */
+ stxvl 32+v3,r11,r10 /* Partial store */
+ add r11,r11,r9
+ sub r5,r5,r9
+
+/* This code pads the remainder of dest with NULL bytes. */
+L(zero_padding_loop):
+ cmpldi cr6,r5,16 /* Check if length was reached. */
+ ble cr6,L(zero_padding_end)
+
+ stxv v18,0(r11)
+ addi r11,r11,16
+ addi r5,r5,-16
+
+ b L(zero_padding_loop)
+
+L(zero_padding_end):
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ stxvl v18,r11,r10 /* Partial store */
+ blr
+
+L(n_tail):
+
+END (FUNC_NAME)
@@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
ifneq (,$(filter %le,$(config-machine)))
sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
- rawmemchr-power9 strlen-power9
+ rawmemchr-power9 strlen-power9 strncpy-power9
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
@@ -301,6 +301,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c. */
IFUNC_IMPL (i, name, strncpy,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, strncpy,
+ hwcap2 & PPC_FEATURE2_ARCH_3_00,
+ __strncpy_power9)
+#endif
IFUNC_IMPL_ADD (array, i, strncpy,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__strncpy_power8)
new file mode 100644
@@ -0,0 +1,26 @@
+/* Optimized strncpy implementation for POWER9/PPC64.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
+#define STRNCPY __strncpy_power9 /* Build the P9 implementation under its multiarch name.  */
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name) /* Suppress the hidden alias; the strncpy ifunc provides the internal entry.  */
+
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+#endif
@@ -28,11 +28,18 @@
extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
+# endif
# undef strncpy
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc_redirected (__redirect_strncpy, strncpy,
+# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ ? __strncpy_power9 :
+# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __strncpy_power8
: (hwcap & PPC_FEATURE_HAS_VSX)