diff mbox series

[v2,1/2] powerpc: Add optimized strncpy for POWER9

Message ID 20200904165653.16202-1-rzinsly@linux.ibm.com
State New
Headers show
Series [v2,1/2] powerpc: Add optimized strncpy for POWER9 | expand

Commit Message

Raphael Moreira Zinsly Sept. 4, 2020, 4:56 p.m. UTC
Changes since v1:
	- Fixed comment indentation and added some spaces to improve
	  readability.
	- Use "POWER 9 LE" instead of "PowerPC64/POWER9".
	- Fixed copyright dates.
	- Replaced cmpwi with cmpdi.

---8<---

Similar to the strcpy P9 optimization, this version uses VSX to improve
performance.
---
 sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 281 ++++++++++++++++++
 sysdeps/powerpc/powerpc64/multiarch/Makefile  |   2 +-
 .../powerpc64/multiarch/ifunc-impl-list.c     |   5 +
 .../powerpc64/multiarch/strncpy-power9.S      |  26 ++
 sysdeps/powerpc/powerpc64/multiarch/strncpy.c |   7 +
 5 files changed, 320 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S

Comments

Raphael Moreira Zinsly Sept. 4, 2020, 4:59 p.m. UTC | #1
Benchtest output:
                             	generic_strncpy	__strncpy_power9	__strncpy_power8	__strncpy_power7	__strncpy_ppc
Length   16, n   16, alignment  1/ 1:	6.44861	2.51617	2.54878	5.94753 
9.41467
Length   16, n   16, alignment  1/ 1:	6.4448	2.51688	2.56978	5.86275	9.52956
Length   16, n   16, alignment  1/ 2:	6.51392	2.53026	2.55617	5.96487 
9.51182
Length   16, n   16, alignment  2/ 1:	6.5421	2.5026	2.82458	5.95353	9.36524
Length    2, n    4, alignment  7/ 2:	8.02857	2.19272	4.35397	4.97347 
8.60923
Length    4, n    2, alignment  2/ 7:	6.04262	1.66226	2.31865	3.27123 
6.23803
Length    2, n    4, alignment  7/ 2:	8.15691	2.21924	4.48871	4.97328	8.3591
Length    4, n    2, alignment  2/ 7:	6.0428	1.66435	2.31671	3.2874	6.23902
Length   16, n   16, alignment  2/ 2:	6.75511	2.51667	2.82529	5.65252 
9.32002
Length   16, n   16, alignment  2/ 2:	6.53469	2.51982	2.82678	5.93257 
9.25613
Length   16, n   16, alignment  2/ 4:	6.3502	2.53333	2.82267	5.66948	9.35942
Length   16, n   16, alignment  4/ 2:	6.71533	2.51217	3.47278	5.95821	8.3249
Length    4, n    8, alignment  6/ 4:	7.85332	2.21708	5.68665	4.83111 
9.07271
Length    8, n    4, alignment  4/ 6:	5.93863	1.67938	2.67249	3.07391 
7.90751
Length    4, n    8, alignment  6/ 4:	8.24352	2.16644	5.22268	5.04674 
9.10352
Length    8, n    4, alignment  4/ 6:	5.88514	1.67966	2.67286	3.29382 
7.66757
Length   16, n   16, alignment  3/ 3:	6.55525	2.52511	3.06709	5.95625 
9.23173
Length   16, n   16, alignment  3/ 3:	6.66344	2.50855	3.11771	5.96121 
8.99767
Length   16, n   16, alignment  3/ 6:	6.82163	2.53355	3.0638	5.96451	9.09031
Length   16, n   16, alignment  6/ 3:	6.35636	2.51634	4.17868	5.95112 
7.82576
Length    8, n   16, alignment  5/ 6:	7.46873	2.23953	4.33782	5.76124 
10.2851
Length   16, n    8, alignment  6/ 5:	5.63643	1.88233	2.32899	4.72233 
5.79268
Length    8, n   16, alignment  5/ 6:	7.47291	2.65201	3.9103	5.40334	10.3902
Length   16, n    8, alignment  6/ 5:	5.73738	1.8787	2.32749	4.69061	6.03053
Length   16, n   16, alignment  4/ 4:	6.63998	2.5166	3.5133	5.83764	8.17814
Length   16, n   16, alignment  4/ 4:	6.6866	2.51915	3.5831	5.96121	8.32436
Length   16, n   16, alignment  4/ 0:	6.58543	2.51529	3.38441	5.96909 
8.03797
Length   16, n   16, alignment  0/ 4:	6.6541	1.87852	2.45328	5.96068	7.32961
Length   16, n   32, alignment  4/ 0:	9.37236	3.00744	5.92214	7.25884 
11.1515
Length   32, n   16, alignment  0/ 4:	6.2795	1.87939	2.45688	5.96206	7.03327
Length   16, n   32, alignment  4/ 0:	9.24513	3.00344	5.97977	6.94778 
11.0213
Length   32, n   16, alignment  0/ 4:	6.45422	1.87851	2.45698	5.96172 
7.32939
Length   16, n   16, alignment  5/ 5:	6.53949	2.51619	3.88095	5.96091 
9.05987
Length   16, n   16, alignment  5/ 5:	6.47371	2.51703	3.91695	5.96417 
9.24674
Length   16, n   16, alignment  5/ 2:	6.5493	2.5163	3.78779	5.95898	9.44104
Length   16, n   16, alignment  2/ 5:	6.70967	2.52226	2.82034	5.96365 
9.37646
Length   32, n   64, alignment  3/ 2:	14.0298	3.74521	6.80923	11.2825 
12.8659
Length   64, n   32, alignment  2/ 3:	9.53123	2.75624	3.21242	8.51653 
12.6887
Length   32, n   64, alignment  3/ 2:	14.179	3.83256	6.56898	11.3584	15.2479
Length   64, n   32, alignment  2/ 3:	9.53184	2.75305	3.21245	8.37087 
14.1081
Length   16, n   16, alignment  6/ 6:	6.42159	2.51726	4.38574	5.9562	7.12266
Length   16, n   16, alignment  6/ 6:	6.67028	2.51692	4.2448	5.9544	7.81439
Length   16, n   16, alignment  6/ 4:	6.42402	2.51636	4.23817	5.96162 
7.23351
Length   16, n   16, alignment  4/ 6:	6.60107	2.53036	3.54038	5.95837 
8.32176
Length   64, n  128, alignment  2/ 4:	15.5573	4.80414	7.45917	11.5659 
16.9298
Length  128, n   64, alignment  4/ 2:	11.6195	3.53279	4.80585	10.1583 
11.6096
Length   64, n  128, alignment  2/ 4:	15.5233	4.7997	7.34679	11.6628	22.0123
Length  128, n   64, alignment  4/ 2:	11.6078	3.5492	4.77929	10.027	19.504
Length   16, n   16, alignment  7/ 7:	6.54515	2.5141	5.04928	5.95083	7.57587
Length   16, n   16, alignment  7/ 7:	7.00425	2.51299	5.06765	5.92888 
8.25286
Length   16, n   16, alignment  7/ 6:	6.62954	2.51922	5.07189	6.02372 
7.72968
Length   16, n   16, alignment  6/ 7:	6.34475	2.51841	4.36954	5.95968 
7.78498
Length  128, n  256, alignment  1/ 6:	17.9386	7.60767	9.40348	16.5301 
20.6134
Length  256, n  128, alignment  6/ 1:	13.373	4.84375	7.34616	12.3919	15.1296
Length  128, n  256, alignment  1/ 6:	17.9186	7.6077	9.37853	16.686	39.2821
Length  256, n  128, alignment  6/ 1:	13.3632	4.91799	8.06183	12.4174 
34.1655
Length    8, n   16, alignment  0/ 0:	7.36981	2.22579	4.22739	4.9063	7.24636
Length   32, n   16, alignment  0/ 0:	6.43465	1.87932	2.45308	2.41526	7.1679
Length    8, n   16, alignment  7/ 2:	7.48861	2.21639	3.75708	5.35882 
8.45777
Length   32, n   16, alignment  7/ 2:	7.03412	2.3535	5.04692	5.95484	7.25068
Length   16, n   32, alignment  0/ 0:	9.10177	3.06646	4.81682	4.41358 
9.89656
Length   64, n   32, alignment  0/ 0:	8.57287	2.53847	2.94869	2.70506	8.2629
Length   16, n   32, alignment  6/ 4:	9.20906	3.04216	6.37553	9.46301 
10.2489
Length   64, n   32, alignment  6/ 4:	9.73117	2.75023	4.49311	7.7856	9.59261
Length   32, n   64, alignment  0/ 0:	10.9253	3.80104	4.83111	4.97682 
12.1086
Length  128, n   64, alignment  0/ 0:	9.26987	3.15895	3.49112	4.31372 
10.1329
Length   32, n   64, alignment  5/ 6:	14.1856	3.78089	7.1768	9.63551	13.9944
Length  128, n   64, alignment  5/ 6:	11.5298	3.5249	5.07847	9.96481	12.8245
Length   64, n  128, alignment  0/ 0:	12.0142	4.73085	5.98759	7.1613	15.0462
Length  256, n  128, alignment  0/ 0:	7.96029	4.50244	6.44433	5.38248 
11.6022
Length   64, n  128, alignment  4/ 0:	12.4223	4.80085	7.79294	11.0101 
15.5277
Length  256, n  128, alignment  4/ 0:	12.2371	4.79242	6.83902	13.2758 
16.0479
Length  128, n  256, alignment  0/ 0:	13.9165	7.28703	8.13319	8.79111 
16.9101
Length  512, n  256, alignment  0/ 0:	10.5083	6.49881	9.05173	9.03139 
19.6212
Length  128, n  256, alignment  3/ 2:	18.025	7.45493	9.86636	18.7234	20.5106
Length  512, n  256, alignment  3/ 2:	16.9588	7.07807	9.97969	23.4911 
25.4407
Length  256, n  512, alignment  0/ 0:	17.6801	12.5811	15.3595	13.9989 
28.5549
Length 1024, n  512, alignment  0/ 0:	16.379	10.7794	16.4748	16.7344	37.8286
Length  256, n  512, alignment  2/ 4:	23.2012	13.2761	14.3776	26.3752 
31.6336
Length 1024, n  512, alignment  2/ 4:	25.4264	12.1716	17.2608	42.2122	47.425
Length  512, n 1024, alignment  0/ 0:	21.0239	23.0736	19.8285	21.0169 
48.0091
Length 2048, n 1024, alignment  0/ 0:	28.424	19.323	36.917	35.4247	68.1661
Length  512, n 1024, alignment  1/ 6:	32.3159	24.2617	21.4919	46.5936	55.163
Length 2048, n 1024, alignment  1/ 6:	43.0359	21.6207	37.7643	77.5705 
83.2998
Matheus Castanho Sept. 16, 2020, 12:24 p.m. UTC | #2
On 9/4/20 1:56 PM, Raphael Moreira Zinsly via Libc-alpha wrote:
> Changes since v1:
> 	- Fixed comments identation and added some spaces to improve
> 	  readbillity.
> 	- Use "POWER 9 LE" instead of "PowerPC64/POWER9".
> 	- Fixed copyright dates.
> 	- Replaced cmpwi for cmpdi.
> 
> ---8<---
> 
> Similar to the strcpy P9 optimization, this version uses VSX to improve
> performance.
> ---
>  sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 281 ++++++++++++++++++
>  sysdeps/powerpc/powerpc64/multiarch/Makefile  |   2 +-
>  .../powerpc64/multiarch/ifunc-impl-list.c     |   5 +
>  .../powerpc64/multiarch/strncpy-power9.S      |  26 ++
>  sysdeps/powerpc/powerpc64/multiarch/strncpy.c |   7 +
>  5 files changed, 320 insertions(+), 1 deletion(-)
>  create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S
>  create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
> 
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> new file mode 100644
> index 0000000000..34fcdee913
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -0,0 +1,281 @@
> +/* Optimized strncpy implementation for POWER9 LE.
> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +# ifndef STRNCPY
> +#  define FUNC_NAME strncpy
> +# else
> +#  define FUNC_NAME STRNCPY
> +# endif
> +
> +/* Implements the function
> +
> +   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> +   The implementation can load bytes past a null terminator, but only
> +   up to the next 16-byte aligned address, so it never crosses a page.  */
> +
> +.machine power9
> +ENTRY_TOCLESS (FUNC_NAME, 4)
> +	CALL_MCOUNT 2
> +
> +	/* NULL string optimizations  */
> +	cmpdi   r5, 0
> +	beqlr
> +
> +	lbz	r0,0(r4)
> +	stb	r0,0(r3)
> +	addi	r11,r3,1
> +	addi	r5,r5,-1
> +	vspltisb v18,0		/* Zeroes in v18  */
> +	cmpdi	r0,0
> +	beq	L(zero_padding_loop)
> +
> +	/* Empty/1-byte string optimization  */
> +	cmpdi	r5,0
> +	beqlr
> +
> +	addi	r4,r4,1
> +	neg	r7,r4
> +	rldicl	r9,r7,0,60	/* How many bytes to get source 16B aligned?  */
> +
> +	/* Get source 16B aligned  */
> +	lvx	v0,0,r4
> +	lvsr	v1,0,r4
> +	vperm	v0,v18,v0,v1
> +
> +	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
> +	vctzlsbb r7,v6		/* Number of trailing zeroes  */
> +	addi	r8,r7,1		/* Add null terminator  */
> +
> +	/* r8 = bytes including null
> +	   r9 = bytes to get source 16B aligned
> +	   if r8 > r9
> +	      no null, copy r9 bytes
> +	   else
> +	      there is a null, copy r8 bytes and return.  */
> +	cmpld	r8,r9
> +	bgt	L(no_null)
> +
> +	cmpld	cr6,r8,r5	/* r8 <= n?  */
> +	ble	cr6,L(null)
> +
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */

At first I was confused by this 32+vX syntax. Maybe we could consider
adding defines for VSX registers to sysdeps/powerpc/sysdep.h in the
future? This way we could refer to v0+32 as vs32, for example. But I
don't think this needs to be part of this patchset.

> +
> +	blr
> +
> +L(null):
> +	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +
> +	add	r11,r11,r8
> +	sub	r5,r5,r8
> +	b L(zero_padding_loop)
> +
> +L(no_null):
> +	cmpld	r9,r5		/* Check if length was reached.  */
> +	bge	L(n_tail1)
> +
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +
> +	add	r4,r4,r9
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +
> +L(loop):
> +	cmpldi	cr6,r5,64	/* Check if length was reached.  */
> +	ble	cr6,L(final_loop)
> +
> +	lxv	32+v0,0(r4)
> +	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail1)
> +
> +	lxv	32+v1,16(r4)
> +	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail2)
> +
> +	lxv	32+v2,32(r4)
> +	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail3)
> +
> +	lxv	32+v3,48(r4)
> +	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail4)
> +
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	stxv	32+v3,48(r11)
> +
> +	addi	r4,r4,64
> +	addi	r11,r11,64
> +	addi	r5,r5,-64
> +
> +	b	L(loop)
> +
> +L(final_loop):
> +	cmpldi	cr5,r5,16
> +	lxv	32+v0,0(r4)
> +	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
> +	ble	cr5,L(prep_n_tail1)
> +	bne	cr6,L(count_tail1)
> +	addi	r5,r5,-16
> +
> +	cmpldi	cr5,r5,16
> +	lxv	32+v1,16(r4)
> +	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
> +	ble	cr5,L(prep_n_tail2)
> +	bne	cr6,L(count_tail2)
> +	addi	r5,r5,-16
> +
> +	cmpldi	cr5,r5,16
> +	lxv	32+v2,32(r4)
> +	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
> +	ble	cr5,L(prep_n_tail3)
> +	bne	cr6,L(count_tail3)
> +	addi	r5,r5,-16
> +
> +	lxv	32+v3,48(r4)
> +	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
> +	beq	cr6,L(n_tail4)
> +
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpld	r8,r5		/* r8 < n?  */
> +	blt	L(tail4)
> +
> +L(n_tail4):
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,48	/* Offset */
> +	stxvl	32+v3,r11,r10	/* Partial store  */
> +	blr
> +
> +L(prep_n_tail1):
> +	beq	cr6,L(n_tail1)	/* Any zero bytes?  */
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpld	r8,r5		/* r8 < n?  */
> +	blt	L(tail1)
> +
> +L(n_tail1):
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +	blr
> +
> +L(prep_n_tail2):
> +	beq	cr6,L(n_tail2)	/* Any zero bytes?  */
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpld	r8,r5		/* r8 < n?  */
> +	blt	L(tail2)
> +
> +L(n_tail2):
> +	stxv	32+v0,0(r11)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,16	/* offset */
> +	stxvl	32+v1,r11,r10	/* Partial store  */
> +	blr
> +
> +L(prep_n_tail3):
> +	beq	cr6,L(n_tail3)	/* Any zero bytes?  */
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpld	r8,r5		/* r8 < n?  */
> +	blt	L(tail3)
> +
> +L(n_tail3):
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,32	/* Offset */
> +	stxvl	32+v2,r11,r10	/* Partial store  */
> +	blr
> +
> +L(prep_tail1):
> +L(count_tail1):
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail1):
> +	addi	r9,r8,1		/* Add null terminator  */
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +	b L(zero_padding_loop)
> +
> +L(prep_tail2):
> +	addi	r5,r5,-16
> +L(count_tail2):
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail2):
> +	addi	r9,r8,1		/* Add null terminator  */
> +	stxv	32+v0,0(r11)
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,16	/* offset */
> +	stxvl	32+v1,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +	b L(zero_padding_loop)
> +
> +L(prep_tail3):
> +	addi	r5,r5,-32
> +L(count_tail3):
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail3):
> +	addi	r9,r8,1		/* Add null terminator  */
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,32	/* offset */
> +	stxvl	32+v2,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +	b L(zero_padding_loop)
> +
> +L(prep_tail4):
> +	addi	r5,r5,-48
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail4):
> +	addi	r9,r8,1		/* Add null terminator  */
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,48	/* offset */
> +	stxvl	32+v3,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +
> +/* This code pads the remainder of dest with NULL bytes.  */
> +L(zero_padding_loop):
> +	cmpldi	cr6,r5,16	/* Check if length was reached.  */
> +	ble	cr6,L(zero_padding_end)
> +
> +	stxv	v18,0(r11)
> +	addi	r11,r11,16
> +	addi	r5,r5,-16
> +
> +	b	L(zero_padding_loop)
> +
> +L(zero_padding_end):
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	v18,r11,r10	/* Partial store  */
> +	blr
> +

The logic looks good. I tried to find a way to reuse some code, as there
are many similar blocks (e.g. tail* blocks). But their slight
differences make it hard to reuse anything.

> +END (FUNC_NAME)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index 19acb6c64a..cd2b47b403 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> @@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
>  
>  ifneq (,$(filter %le,$(config-machine)))
>  sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
> -		   rawmemchr-power9 strlen-power9
> +		   rawmemchr-power9 strlen-power9 strncpy-power9
>  endif
>  CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
>  CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index ea10b00417..aa63e1c23f 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -301,6 +301,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  
>    /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c.  */
>    IFUNC_IMPL (i, name, strncpy,
> +#ifdef __LITTLE_ENDIAN__
> +	      IFUNC_IMPL_ADD (array, i, strncpy,
> +			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
> +			      __strncpy_power9)
> +#endif
>  	      IFUNC_IMPL_ADD (array, i, strncpy,
>  			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
>  			      __strncpy_power8)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
> new file mode 100644
> index 0000000000..ab7c570d54
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
> @@ -0,0 +1,26 @@
> +/* Optimized strncpy implementation for POWER9 LE.
> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
> +#define STRNCPY __strncpy_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
> +#endif
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> index 7bacf28aca..8ef0a99cb5 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> @@ -28,11 +28,18 @@
>  extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
>  extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
>  extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
> +# endif
>  # undef strncpy
>  
>  /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
>   ifunc symbol properly. */
>  libc_ifunc_redirected (__redirect_strncpy, strncpy,
> +# ifdef __LITTLE_ENDIAN__
> +		     (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> +		     ? __strncpy_power9 :
> +# endif
>  		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
>  		       ? __strncpy_power8
>  		       : (hwcap & PPC_FEATURE_HAS_VSX)
> 

--

The only thing missing now seems to be the .machine power9 issue that
was pointed out in v1.

Otherwise, LGTM.

Reviewed-by: Matheus Castanho <msc@linux.ibm.com>

--
Matheus Castanho
diff mbox series

Patch

diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
new file mode 100644
index 0000000000..34fcdee913
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
@@ -0,0 +1,281 @@ 
+/* Optimized strncpy implementation for POWER9 LE.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+# ifndef STRNCPY
+#  define FUNC_NAME strncpy
+# else
+#  define FUNC_NAME STRNCPY
+# endif
+
+/* Implements the function
+
+   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   The implementation can load bytes past a null terminator, but only
+   up to the next 16-byte aligned address, so it never crosses a page.  */
+
+.machine power9
+ENTRY_TOCLESS (FUNC_NAME, 4)
+	CALL_MCOUNT 2
+
+	/* NULL string optimizations  */
+	cmpdi   r5, 0
+	beqlr
+
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+	addi	r11,r3,1
+	addi	r5,r5,-1
+	vspltisb v18,0		/* Zeroes in v18  */
+	cmpdi	r0,0
+	beq	L(zero_padding_loop)
+
+	/* Empty/1-byte string optimization  */
+	cmpdi	r5,0
+	beqlr
+
+	addi	r4,r4,1
+	neg	r7,r4
+	rldicl	r9,r7,0,60	/* How many bytes to get source 16B aligned?  */
+
+	/* Get source 16B aligned  */
+	lvx	v0,0,r4
+	lvsr	v1,0,r4
+	vperm	v0,v18,v0,v1
+
+	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
+	vctzlsbb r7,v6		/* Number of trailing zeroes  */
+	addi	r8,r7,1		/* Add null terminator  */
+
+	/* r8 = bytes including null
+	   r9 = bytes to get source 16B aligned
+	   if r8 > r9
+	      no null, copy r9 bytes
+	   else
+	      there is a null, copy r8 bytes and return.  */
+	cmpld	r8,r9
+	bgt	L(no_null)
+
+	cmpld	cr6,r8,r5	/* r8 <= n?  */
+	ble	cr6,L(null)
+
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	blr
+
+L(null):
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	add	r11,r11,r8
+	sub	r5,r5,r8
+	b L(zero_padding_loop)
+
+L(no_null):
+	cmpld	r9,r5		/* Check if length was reached.  */
+	bge	L(n_tail1)
+
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	add	r4,r4,r9
+	add	r11,r11,r9
+	sub	r5,r5,r9
+
+L(loop):
+	cmpldi	cr6,r5,64	/* Check if length was reached.  */
+	ble	cr6,L(final_loop)
+
+	lxv	32+v0,0(r4)
+	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail1)
+
+	lxv	32+v1,16(r4)
+	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail2)
+
+	lxv	32+v2,32(r4)
+	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail3)
+
+	lxv	32+v3,48(r4)
+	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail4)
+
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	stxv	32+v3,48(r11)
+
+	addi	r4,r4,64
+	addi	r11,r11,64
+	addi	r5,r5,-64
+
+	b	L(loop)
+
+L(final_loop):
+	cmpldi	cr5,r5,16
+	lxv	32+v0,0(r4)
+	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
+	ble	cr5,L(prep_n_tail1)
+	bne	cr6,L(count_tail1)
+	addi	r5,r5,-16
+
+	cmpldi	cr5,r5,16
+	lxv	32+v1,16(r4)
+	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
+	ble	cr5,L(prep_n_tail2)
+	bne	cr6,L(count_tail2)
+	addi	r5,r5,-16
+
+	cmpldi	cr5,r5,16
+	lxv	32+v2,32(r4)
+	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
+	ble	cr5,L(prep_n_tail3)
+	bne	cr6,L(count_tail3)
+	addi	r5,r5,-16
+
+	lxv	32+v3,48(r4)
+	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
+	beq	cr6,L(n_tail4)
+
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpld	r8,r5		/* r8 < n?  */
+	blt	L(tail4)
+
+L(n_tail4):
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,48	/* Offset */
+	stxvl	32+v3,r11,r10	/* Partial store  */
+	blr
+
+L(prep_n_tail1):
+	beq	cr6,L(n_tail1)	/* Any zero bytes?  */
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpld	r8,r5		/* r8 < n?  */
+	blt	L(tail1)
+
+L(n_tail1):
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+	blr
+
+L(prep_n_tail2):
+	beq	cr6,L(n_tail2)	/* Any zero bytes?  */
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpld	r8,r5		/* r8 < n?  */
+	blt	L(tail2)
+
+L(n_tail2):
+	stxv	32+v0,0(r11)
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,16	/* offset */
+	stxvl	32+v1,r11,r10	/* Partial store  */
+	blr
+
+L(prep_n_tail3):
+	beq	cr6,L(n_tail3)	/* Any zero bytes?  */
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpld	r8,r5		/* r8 < n?  */
+	blt	L(tail3)
+
+L(n_tail3):
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,32	/* Offset */
+	stxvl	32+v2,r11,r10	/* Partial store  */
+	blr
+
+L(prep_tail1):
+L(count_tail1):
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail1):
+	addi	r9,r8,1		/* Add null terminator  */
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+	b L(zero_padding_loop)
+
+L(prep_tail2):
+	addi	r5,r5,-16
+L(count_tail2):
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail2):
+	addi	r9,r8,1		/* Add null terminator  */
+	stxv	32+v0,0(r11)
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,16	/* offset */
+	stxvl	32+v1,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+	b L(zero_padding_loop)
+
+L(prep_tail3):
+	addi	r5,r5,-32
+L(count_tail3):
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail3):
+	addi	r9,r8,1		/* Add null terminator  */
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,32	/* offset */
+	stxvl	32+v2,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+	b L(zero_padding_loop)
+
+L(prep_tail4):
+	addi	r5,r5,-48
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail4):
+	addi	r9,r8,1		/* Add null terminator  */
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,48	/* offset */
+	stxvl	32+v3,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+
+/* This code pads the remainder of dest with NULL bytes.  */
+L(zero_padding_loop):
+	cmpldi	cr6,r5,16	/* Check if length was reached.  */
+	ble	cr6,L(zero_padding_end)
+
+	stxv	v18,0(r11)
+	addi	r11,r11,16
+	addi	r5,r5,-16
+
+	b	L(zero_padding_loop)
+
+L(zero_padding_end):
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	stxvl	v18,r11,r10	/* Partial store  */
+	blr
+
+END (FUNC_NAME)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 19acb6c64a..cd2b47b403 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@  sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 
 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
-		   rawmemchr-power9 strlen-power9
+		   rawmemchr-power9 strlen-power9 strncpy-power9
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index ea10b00417..aa63e1c23f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -301,6 +301,11 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
+#ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, strncpy,
+			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __strncpy_power9)
+#endif
 	      IFUNC_IMPL_ADD (array, i, strncpy,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strncpy_power8)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
new file mode 100644
index 0000000000..ab7c570d54
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
@@ -0,0 +1,26 @@ 
+/* Optimized strncpy implementation for POWER9 LE.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
+#define STRNCPY __strncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
index 7bacf28aca..8ef0a99cb5 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -28,11 +28,18 @@ 
 extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
 extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
 extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
+# endif
 # undef strncpy
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
  ifunc symbol properly. */
 libc_ifunc_redirected (__redirect_strncpy, strncpy,
+# ifdef __LITTLE_ENDIAN__
+		     (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+		     ? __strncpy_power9 :
+# endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __strncpy_power8
 		       : (hwcap & PPC_FEATURE_HAS_VSX)