Patchwork [1/2] Hoist ZVA check out of the function

login
register
mail settings
Submitter Siddhesh Poyarekar
Date Sept. 19, 2017, 3:23 p.m.
Message ID <1505834596-21641-1-git-send-email-siddhesh@sourceware.org>
Download mbox | patch
Permalink /patch/22957/
State New
Headers show

Comments

Siddhesh Poyarekar - Sept. 19, 2017, 3:23 p.m.
The DZP bit in the dczid_el0 register does not change dynamically, so
it is safe to read once during program startup.  Hoist the zva check
into an ifunc resolver and store the result into a static variable,
which can be read in case of non-standard zva sizes.  This effectively
adds 3 ifunc variants for memset - one for cases where zva is
disabled, one for 64 byte zva and another for 128 byte zva.  I have
retained the older memset as __memset_generic for internal libc.so use
so that the change impact is minimal.  We should eventually have a
discussion on what is more expensive, reading dczid_el0 on every
memset invocation or the indirection due to PLT.

The gains due to this are significant for falkor, with gains as high
as 80% in some cases.  Likewise for mustang, although the numbers are
slightly lower.  Here's a sample from the falkor tests:

Function: memset
Variant: walk
                                    simple_memset	__memset_nozva	__memset_zva_64	__memset_zva_default	__memset_generic
Siddhesh Poyarekar - Sept. 28, 2017, 2:01 p.m.
Ping.

On Tuesday 19 September 2017 08:53 PM, Siddhesh Poyarekar wrote:
> The DZP bit in the dczid_el0 register does not change dynamically, so
> it is safe to read once during program startup.  Hoist the zva check
> into an ifunc resolver and store the result into a static variable,
> which can be read in case of non-standard zva sizes.  This effectively
> adds 3 ifunc variants for memset - one for cases where zva is
> disabled, one for 64 byte zva and another for 128 byte zva.  I have
> retained the older memset as __memset_generic for internal libc.so use
> so that the change impact is minimal.  We should eventually have a
> discussion on what is more expensive, reading dczid_el0 on every
> memset invocation or the indirection due to PLT.
> 
> The gains due to this are significant for falkor, with gains as high
> as 80% in some cases.  Likewise for mustang, although the numbers are
> slightly lower.  Here's a sample from the falkor tests:
> 
> Function: memset
> Variant: walk
>                                     simple_memset	__memset_nozva	__memset_zva_64	__memset_zva_default	__memset_generic
> ========================================================================================================================
>                   length=256, char=0:         1.82 (-87.26%)	       26.99 ( 89.30%)	       25.49 ( 78.76%)	       23.48 ( 64.65%)	       14.26
>                   length=257, char=0:         1.82 (-87.29%)	       26.97 ( 88.44%)	       25.77 ( 80.12%)	       24.41 ( 70.57%)	       14.31
>                   length=258, char=0:         1.82 (-87.38%)	       26.27 ( 82.29%)	       25.84 ( 79.28%)	       24.33 ( 68.80%)	       14.41
>                   length=259, char=0:         1.82 (-87.36%)	       26.06 ( 81.15%)	       25.72 ( 78.84%)	       24.57 ( 70.80%)	       14.38
>                   length=260, char=0:         1.82 (-87.44%)	       25.35 ( 75.23%)	       25.93 ( 79.23%)	       24.34 ( 68.24%)	       14.47
>                   length=261, char=0:         1.82 (-87.49%)	       26.15 ( 79.70%)	       26.01 ( 78.72%)	       24.44 ( 67.97%)	       14.55
>                   length=262, char=0:         1.82 (-87.54%)	       25.91 ( 77.31%)	       26.06 ( 78.35%)	       24.33 ( 66.49%)	       14.61
>                   length=263, char=0:         1.82 (-87.54%)	       25.69 ( 75.80%)	       25.96 ( 77.63%)	       24.54 ( 67.90%)	       14.61
>                   length=264, char=0:         1.82 (-87.57%)	       25.31 ( 72.69%)	       26.16 ( 78.43%)	       24.63 ( 68.00%)	       14.66
>                   length=265, char=0:         1.82 (-87.65%)	       25.29 ( 71.35%)	       26.25 ( 77.84%)	       24.58 ( 66.53%)	       14.76
>                   length=266, char=0:         1.82 (-87.69%)	       25.10 ( 69.40%)	       26.15 ( 76.48%)	       24.77 ( 67.22%)	       14.82
>                   length=267, char=0:         1.82 (-87.69%)	       24.89 ( 68.02%)	       26.20 ( 76.90%)	       24.87 ( 67.91%)	       14.81
>                   length=268, char=0:         1.82 (-87.74%)	       24.07 ( 62.04%)	       26.40 ( 77.74%)	       24.95 ( 67.93%)	       14.85
>                   length=269, char=0:         1.82 (-87.80%)	       23.82 ( 59.29%)	       26.47 ( 77.00%)	       24.89 ( 66.43%)	       14.96
>                   length=270, char=0:         1.82 (-87.84%)	       23.65 ( 57.61%)	       26.35 ( 75.58%)	       25.07 ( 67.07%)	       15.01
>                   length=271, char=0:         1.83 (-87.82%)	       23.48 ( 56.53%)	       26.39 ( 75.93%)	       25.15 ( 67.66%)	       15.00
>                   length=512, char=0:         1.90 (-92.59%)	       29.25 ( 13.81%)	       36.30 ( 41.27%)	       40.95 ( 59.36%)	       25.70
>                   length=513, char=0:         1.90 (-92.57%)	       29.29 ( 14.35%)	       36.63 ( 43.01%)	       40.80 ( 59.28%)	       25.61
>                   length=514, char=0:         1.90 (-92.62%)	       28.61 ( 10.91%)	       36.64 ( 42.05%)	       40.89 ( 58.52%)	       25.80
>                   length=515, char=0:         1.90 (-92.63%)	       28.74 ( 11.29%)	       36.68 ( 42.06%)	       40.56 ( 57.08%)	       25.82
>                   length=516, char=0:         1.90 (-92.65%)	       28.33 (  9.54%)	       36.72 ( 41.96%)	       40.09 ( 55.01%)	       25.87
>                   length=517, char=0:         1.90 (-92.66%)	       28.41 (  9.60%)	       36.80 ( 41.97%)	       39.43 ( 52.13%)	       25.92
>                   length=518, char=0:         1.90 (-92.66%)	       28.16 (  8.45%)	       36.84 ( 41.89%)	       39.40 ( 51.77%)	       25.96
>                   length=519, char=0:         1.90 (-92.67%)	       28.21 (  8.58%)	       36.86 ( 41.86%)	       40.39 ( 55.46%)	       25.98
>                   length=520, char=0:         1.90 (-92.65%)	       27.53 (  6.32%)	       36.90 ( 42.49%)	       40.80 ( 57.58%)	       25.89
>                   length=521, char=0:         1.90 (-92.69%)	       27.53 (  5.65%)	       36.61 ( 40.50%)	       40.86 ( 56.81%)	       26.05
>                   length=522, char=0:         1.90 (-92.66%)	       27.40 (  5.59%)	       36.95 ( 42.35%)	       40.92 ( 57.64%)	       25.95
>                   length=523, char=0:         1.91 (-92.71%)	       27.50 (  5.29%)	       36.69 ( 40.45%)	       40.97 ( 56.84%)	       26.12
>                   length=524, char=0:         1.90 (-92.69%)	       27.33 (  5.02%)	       37.00 ( 42.18%)	       40.98 ( 57.50%)	       26.02
>                   length=525, char=0:         1.91 (-92.72%)	       27.24 (  4.04%)	       36.70 ( 40.13%)	       41.04 ( 56.72%)	       26.19
>                   length=526, char=0:         1.90 (-92.70%)	       27.06 (  3.73%)	       37.06 ( 42.05%)	       41.08 ( 57.44%)	       26.09
>                   length=527, char=0:         1.91 (-92.74%)	       26.82 (  2.17%)	       37.06 ( 41.17%)	       41.11 ( 56.62%)	       26.25
>                  length=1024, char=0:         1.95 (-95.35%)	       30.55 (-27.12%)	       46.52 ( 10.99%)	       49.89 ( 19.02%)	       41.91
>                  length=1025, char=0:         1.95 (-95.31%)	       30.58 (-26.47%)	       46.57 ( 11.98%)	       49.92 ( 20.05%)	       41.59
>                  length=1026, char=0:         1.95 (-95.36%)	       30.35 (-27.70%)	       46.56 ( 10.92%)	       49.45 ( 17.79%)	       41.98
>                  length=1027, char=0:         1.95 (-95.36%)	       30.24 (-28.02%)	       46.20 (  9.98%)	       49.93 ( 18.88%)	       42.00
>                  length=1028, char=0:         1.95 (-95.37%)	       29.75 (-29.25%)	       46.58 ( 10.76%)	       49.92 ( 18.71%)	       42.05
>                  length=1029, char=0:         1.95 (-95.37%)	       29.78 (-29.24%)	       46.57 ( 10.65%)	       49.96 ( 18.72%)	       42.08
>                  length=1030, char=0:         1.95 (-95.33%)	       29.77 (-28.73%)	       46.63 ( 11.63%)	       49.97 ( 19.64%)	       41.77
>                  length=1031, char=0:         1.95 (-95.37%)	       29.64 (-29.68%)	       46.62 ( 10.59%)	       49.51 ( 17.46%)	       42.15
>                  length=1032, char=0:         1.95 (-95.38%)	       29.60 (-29.80%)	       46.22 (  9.63%)	       49.99 ( 18.58%)	       42.16
>                  length=1033, char=0:         1.95 (-95.38%)	       29.32 (-30.55%)	       46.65 ( 10.49%)	       49.95 ( 18.32%)	       42.22
>                  length=1034, char=0:         1.95 (-95.39%)	       29.45 (-30.31%)	       46.67 ( 10.44%)	       50.01 ( 18.36%)	       42.25
>                  length=1035, char=0:         1.95 (-95.35%)	       29.31 (-30.09%)	       46.68 ( 11.34%)	       50.02 ( 19.31%)	       41.92
>                  length=1036, char=0:         1.95 (-95.40%)	       29.30 (-30.75%)	       46.66 ( 10.27%)	       49.56 ( 17.12%)	       42.32
>                  length=1037, char=0:         1.95 (-95.39%)	       29.17 (-31.08%)	       46.30 (  9.38%)	       50.04 ( 18.22%)	       42.33
>                  length=1038, char=0:         1.95 (-95.40%)	       29.12 (-31.30%)	       46.71 ( 10.19%)	       50.02 ( 18.01%)	       42.39
>                  length=1039, char=0:         1.95 (-95.40%)	       29.19 (-31.20%)	       46.73 ( 10.14%)	       50.06 ( 18.00%)	       42.43
> 
> 	* sysdeps/aarch64/memset.S (do_no_zva): New macro.
> 	(do_zva_64): Likewise.
> 	(do_zva_128): Likewise.
> 	(do_zva_default): Likewise.
> 	(__memset): Rename to MEMSET macro.
> 	(MEMSET): Use the new macros.
> 	(MEMSET)[INTERNAL_MEMSET]: Retain old memset.
> 	(MEMSET)[!INTERNAL_MEMSET]: Remove zva check.
> 	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
> 	Add memset_generic, memset_nozva and memset_zva.
> 	* sysdeps/aarch64/multiarch/ifunc-impl-list.c
> 	(__libc_ifunc_impl_list): Add memset ifuncs.
> 	* sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New
> 	static variable __aarch64_zva_size and local variable
> 	zva_size.
> 	* sysdeps/aarch64/multiarch/memset.c: New file.
> 	* sysdeps/aarch64/multiarch/memset_generic.S: New file.
> 	* sysdeps/aarch64/multiarch/memset_nozva.S: New file.
> 	* sysdeps/aarch64/multiarch/memset_zva.S: New file.
> 	* sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> 	(DCZID_DZP_MASK): New macro.
> 	(DCZID_BS_MASK): Likewise.
> 	(init_cpu_features): Read and set zva_size.
> 	* sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> 	(struct cpu_features): New member zva_size.
> ---
>  sysdeps/aarch64/memset.S                       | 248 +++++++++++++++----------
>  sysdeps/aarch64/multiarch/Makefile             |   2 +-
>  sysdeps/aarch64/multiarch/ifunc-impl-list.c    |   6 +
>  sysdeps/aarch64/multiarch/init-arch.h          |   9 +-
>  sysdeps/aarch64/multiarch/memset.c             |  47 +++++
>  sysdeps/aarch64/multiarch/memset_generic.S     |  27 +++
>  sysdeps/aarch64/multiarch/memset_nozva.S       |  22 +++
>  sysdeps/aarch64/multiarch/memset_zva.S         |  41 ++++
>  sysdeps/unix/sysv/linux/aarch64/cpu-features.c |  10 +
>  sysdeps/unix/sysv/linux/aarch64/cpu-features.h |   1 +
>  10 files changed, 313 insertions(+), 100 deletions(-)
>  create mode 100644 sysdeps/aarch64/multiarch/memset.c
>  create mode 100644 sysdeps/aarch64/multiarch/memset_generic.S
>  create mode 100644 sysdeps/aarch64/multiarch/memset_nozva.S
>  create mode 100644 sysdeps/aarch64/multiarch/memset_zva.S
> 
> diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
> index 110fd22..8cff3a4 100644
> --- a/sysdeps/aarch64/memset.S
> +++ b/sysdeps/aarch64/memset.S
> @@ -37,7 +37,108 @@
>  #define zva_len x7
>  #define zva_lenw w7
>  
> -ENTRY_ALIGN (__memset, 6)
> +/* Macros that do the critical loops for either no zva or zva of 64 bytes, 128
> +   bytes and higher sizes.  */
> +
> +#ifndef ZVA_MACROS
> +# define ZVA_MACROS
> +/* No ZVA.  */
> +.macro do_no_zva
> +	sub	count, dstend, dst	/* Count is 16 too large.  */
> +	add	dst, dst, 16
> +	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
> +1:	stp	q0, q0, [dst], 64
> +	stp	q0, q0, [dst, -32]
> +	subs	count, count, 64
> +	b.hi	1b
> +	stp	q0, q0, [dstend, -64]
> +	stp	q0, q0, [dstend, -32]
> +	ret
> +.endm
> +
> +/* Write the first and last 64 byte aligned block using stp rather
> +   than using DC ZVA.  This is faster on some cores.  */
> +.macro do_zva_64
> +	str	q0, [dst, 16]
> +	stp	q0, q0, [dst, 32]
> +	bic	dst, dst, 63
> +	stp	q0, q0, [dst, 64]
> +	stp	q0, q0, [dst, 96]
> +	sub	count, dstend, dst	/* Count is now 128 too large.	*/
> +	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
> +	add	dst, dst, 128
> +	nop
> +1:	dc	zva, dst
> +	add	dst, dst, 64
> +	subs	count, count, 64
> +	b.hi	1b
> +	stp	q0, q0, [dst, 0]
> +	stp	q0, q0, [dst, 32]
> +	stp	q0, q0, [dstend, -64]
> +	stp	q0, q0, [dstend, -32]
> +	ret
> +.endm
> +
> +/* ZVA size of 128 bytes.  */
> +.macro do_zva_128
> +	str	q0, [dst, 16]
> +	stp	q0, q0, [dst, 32]
> +	stp	q0, q0, [dst, 64]
> +	stp	q0, q0, [dst, 96]
> +	bic	dst, dst, 127
> +	sub	count, dstend, dst	/* Count is now 128 too large.	*/
> +	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
> +	add	dst, dst, 128
> +1:	dc	zva, dst
> +	add	dst, dst, 128
> +	subs	count, count, 128
> +	b.hi	1b
> +	stp	q0, q0, [dstend, -128]
> +	stp	q0, q0, [dstend, -96]
> +	stp	q0, q0, [dstend, -64]
> +	stp	q0, q0, [dstend, -32]
> +	ret
> +.endm
> +
> +/* ZVA size of more than 128 bytes.  */
> +.macro do_zva_default
> +	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
> +	cmp	count, tmp1
> +	blo	MEMSET_L(no_zva)
> +
> +	sub	tmp2, zva_len, 1
> +	add	tmp1, dst, zva_len
> +	add	dst, dst, 16
> +	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
> +	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
> +	beq	2f
> +1:	stp	q0, q0, [dst], 64
> +	stp	q0, q0, [dst, -32]
> +	subs	count, count, 64
> +	b.hi	1b
> +2:	mov	dst, tmp1
> +	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
> +	subs	count, count, zva_len
> +	b.lo	4f
> +3:	dc	zva, dst
> +	add	dst, dst, zva_len
> +	subs	count, count, zva_len
> +	b.hs	3b
> +4:	add	count, count, zva_len
> +	subs	count, count, 64
> +	b.ls	6f
> +5:	stp	q0, q0, [dst], 64
> +	stp	q0, q0, [dst, -32]
> +	subs	count, count, 64
> +	b.hi	5b
> +6:	stp	q0, q0, [dstend, -64]
> +	stp	q0, q0, [dstend, -32]
> +	ret
> +.endm
> +#endif
> +
> +/* Memset entry point.  */
> +ENTRY_ALIGN (MEMSET, 6)
>  
>  	DELOUSE (0)
>  	DELOUSE (2)
> @@ -46,9 +147,9 @@ ENTRY_ALIGN (__memset, 6)
>  	add	dstend, dstin, count
>  
>  	cmp	count, 96
> -	b.hi	L(set_long)
> +	b.hi	MEMSET_L(set_long)
>  	cmp	count, 16
> -	b.hs	L(set_medium)
> +	b.hs	MEMSET_L(set_medium)
>  	mov	val, v0.D[0]
>  
>  	/* Set 0..15 bytes.  */
> @@ -68,9 +169,9 @@ ENTRY_ALIGN (__memset, 6)
>  3:	ret
>  
>  	/* Set 17..96 bytes.  */
> -L(set_medium):
> +MEMSET_L(set_medium):
>  	str	q0, [dstin]
> -	tbnz	count, 6, L(set96)
> +	tbnz	count, 6, MEMSET_L(set96)
>  	str	q0, [dstend, -16]
>  	tbz	count, 5, 1f
>  	str	q0, [dstin, 16]
> @@ -80,7 +181,7 @@ L(set_medium):
>  	.p2align 4
>  	/* Set 64..96 bytes.  Write 64 bytes from the start and
>  	   32 bytes from the end.  */
> -L(set96):
> +MEMSET_L(set96):
>  	str	q0, [dstin, 16]
>  	stp	q0, q0, [dstin, 32]
>  	stp	q0, q0, [dstend, -32]
> @@ -88,108 +189,63 @@ L(set96):
>  
>  	.p2align 3
>  	nop
> -L(set_long):
> +MEMSET_L(set_long):
> +#ifdef INTERNAL_MEMSET
>  	and	valw, valw, 255
>  	bic	dst, dstin, 15
>  	str	q0, [dstin]
>  	cmp	count, 256
>  	ccmp	valw, 0, 0, cs
> -	b.eq	L(try_zva)
> -L(no_zva):
> -	sub	count, dstend, dst	/* Count is 16 too large.  */
> -	add	dst, dst, 16
> -	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
> -1:	stp	q0, q0, [dst], 64
> -	stp	q0, q0, [dst, -32]
> -L(tail64):
> -	subs	count, count, 64
> -	b.hi	1b
> -2:	stp	q0, q0, [dstend, -64]
> -	stp	q0, q0, [dstend, -32]
> -	ret
> +	b.eq	MEMSET_L(try_zva)
>  
> -	.p2align 3
> -L(try_zva):
> +MEMSET_L(no_zva):
> +	do_no_zva
> +
> +	.p2align 4
> +MEMSET_L(try_zva):
>  	mrs	tmp1, dczid_el0
> -	tbnz	tmp1w, 4, L(no_zva)
>  	and	tmp1w, tmp1w, 15
>  	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
> -	b.ne	 L(zva_128)
> +	b.ne	 MEMSET_L(zva_128)
> +	do_zva_64
>  
> -	/* Write the first and last 64 byte aligned block using stp rather
> -	   than using DC ZVA.  This is faster on some cores.
> -	 */
> -L(zva_64):
> -	str	q0, [dst, 16]
> -	stp	q0, q0, [dst, 32]
> -	bic	dst, dst, 63
> -	stp	q0, q0, [dst, 64]
> -	stp	q0, q0, [dst, 96]
> -	sub	count, dstend, dst	/* Count is now 128 too large.	*/
> -	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
> -	add	dst, dst, 128
> -	nop
> -1:	dc	zva, dst
> -	add	dst, dst, 64
> -	subs	count, count, 64
> -	b.hi	1b
> -	stp	q0, q0, [dst, 0]
> -	stp	q0, q0, [dst, 32]
> -	stp	q0, q0, [dstend, -64]
> -	stp	q0, q0, [dstend, -32]
> -	ret
> -
> -	.p2align 3
> -L(zva_128):
> +MEMSET_L(zva_128):
>  	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
> -	b.ne	L(zva_other)
> +	b.ne	MEMSET_L(zva_other)
> +	do_zva_128
>  
> -	str	q0, [dst, 16]
> -	stp	q0, q0, [dst, 32]
> -	stp	q0, q0, [dst, 64]
> -	stp	q0, q0, [dst, 96]
> -	bic	dst, dst, 127
> -	sub	count, dstend, dst	/* Count is now 128 too large.	*/
> -	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
> -	add	dst, dst, 128
> -1:	dc	zva, dst
> -	add	dst, dst, 128
> -	subs	count, count, 128
> -	b.hi	1b
> -	stp	q0, q0, [dstend, -128]
> -	stp	q0, q0, [dstend, -96]
> -	stp	q0, q0, [dstend, -64]
> -	stp	q0, q0, [dstend, -32]
> -	ret
> -
> -L(zva_other):
> +MEMSET_L(zva_other):
>  	mov	tmp2w, 4
>  	lsl	zva_lenw, tmp2w, tmp1w
> -	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
> -	cmp	count, tmp1
> -	blo	L(no_zva)
> +	do_zva_default
> +#else
> +	/* Memset called through PLT, so we need only one of the ZVA
> +	   variants.  */
> +# ifdef MEMSET_ZVA
> +	and	valw, valw, 255
> +# endif
> +	bic	dst, dstin, 15
> +	str	q0, [dstin]
> +# ifdef MEMSET_ZVA
> +	cmp	count, 256
> +	ccmp	valw, 0, 0, cs
> +	b.eq	MEMSET_L(try_zva)
> +# endif
> +MEMSET_L(no_zva):
> +	do_no_zva
> +# if defined MEMSET_ZVA
> +MEMSET_L(try_zva):
> +#  if MEMSET_ZVA == 64
> +	do_zva_64
> +#  elif MEMSET_ZVA == 128
> +	do_zva_128
> +#  else
> +	adrp	zva_len, __aarch64_zva_size
> +	ldr	zva_len, [zva_len, #:lo12:__aarch64_zva_size]
> +	do_zva_default
> +#  endif
> +# endif
> +#endif
>  
> -	sub	tmp2, zva_len, 1
> -	add	tmp1, dst, zva_len
> -	add	dst, dst, 16
> -	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
> -	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
> -	beq	2f
> -1:	stp	q0, q0, [dst], 64
> -	stp	q0, q0, [dst, -32]
> -	subs	count, count, 64
> -	b.hi	1b
> -2:	mov	dst, tmp1
> -	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
> -	subs	count, count, zva_len
> -	b.lo	4f
> -3:	dc	zva, dst
> -	add	dst, dst, zva_len
> -	subs	count, count, zva_len
> -	b.hs	3b
> -4:	add	count, count, zva_len
> -	b	L(tail64)
> -
> -END (__memset)
> -weak_alias (__memset, memset)
> -libc_hidden_builtin_def (memset)
> +END (MEMSET)
> +libc_hidden_builtin_def (MEMSET)
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index 9aa1e79..f611182 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -1,4 +1,4 @@
>  ifeq ($(subdir),string)
>  sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
> -		   memmove_falkor
> +		   memmove_falkor memset_generic memset_nozva memset_zva
>  endif
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index 2cb74d5..29148ac 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -46,6 +46,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
>  	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
>  	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
> +  IFUNC_IMPL (i, name, memset,
> +	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_nozva)
> +	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva_64)
> +	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 128), __memset_zva_128)
> +	      IFUNC_IMPL_ADD (array, i, memset, (zva_size > 0), __memset_zva_default)
> +	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
>  
>    return i;
>  }
> diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
> index 3af442c..541c27e 100644
> --- a/sysdeps/aarch64/multiarch/init-arch.h
> +++ b/sysdeps/aarch64/multiarch/init-arch.h
> @@ -18,6 +18,9 @@
>  
>  #include <ldsodefs.h>
>  
> -#define INIT_ARCH()				\
> -  uint64_t __attribute__((unused)) midr =	\
> -    GLRO(dl_aarch64_cpu_features).midr_el1;
> +#define INIT_ARCH()							      \
> +  uint64_t __attribute__((unused)) midr =				      \
> +    GLRO(dl_aarch64_cpu_features).midr_el1;				      \
> +  extern unsigned __aarch64_zva_size;					      \
> +  unsigned __attribute__((unused)) zva_size = __aarch64_zva_size =	      \
> +    GLRO(dl_aarch64_cpu_features).zva_size;
> diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
> new file mode 100644
> index 0000000..58e669a
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset.c
> @@ -0,0 +1,47 @@
> +/* Multiple versions of memset. AARCH64 version.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +
> +#if IS_IN (libc)
> +/* Redefine memset so that the compiler won't complain about the type
> +   mismatch with the IFUNC selector in strong_alias, below.  */
> +# undef memset
> +# define memset __redirect_memset
> +# include <string.h>
> +# include <init-arch.h>
> +
> +unsigned __aarch64_zva_size;
> +
> +extern __typeof (__redirect_memset) __libc_memset;
> +
> +extern __typeof (__redirect_memset) __memset_nozva attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_64 attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_128 attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_default attribute_hidden;
> +
> +libc_ifunc (__libc_memset, (zva_size == 0 ? __memset_nozva
> +			    : (zva_size == 64 ? __memset_zva_64
> +			       : (zva_size == 128 ? __memset_zva_128
> +				  : __memset_zva_default))));
> +
> +# undef memset
> +strong_alias (__libc_memset, memset);
> +#else
> +#include <string/memset.c>
> +#endif
> diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S
> new file mode 100644
> index 0000000..56f1e02
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_generic.S
> @@ -0,0 +1,27 @@
> +/* Memset for aarch64, default version for internal use.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#define MEMSET __memset_generic
> +#define INTERNAL_MEMSET
> +#define MEMSET_L(label) L(label)
> +#ifdef SHARED
> +	.globl __GI_memset; __GI_memset = __memset_generic
> +#endif
> +
> +#include <sysdeps/aarch64/memset.S>
> diff --git a/sysdeps/aarch64/multiarch/memset_nozva.S b/sysdeps/aarch64/multiarch/memset_nozva.S
> new file mode 100644
> index 0000000..98045ac
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_nozva.S
> @@ -0,0 +1,22 @@
> +/* Memset for aarch64, ZVA disabled.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#define MEMSET __memset_nozva
> +#define MEMSET_L(label) L(label)
> +#include <sysdeps/aarch64/memset.S>
> diff --git a/sysdeps/aarch64/multiarch/memset_zva.S b/sysdeps/aarch64/multiarch/memset_zva.S
> new file mode 100644
> index 0000000..5d02b89
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_zva.S
> @@ -0,0 +1,41 @@
> +/* Memset for aarch64, ZVA enabled.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (libc)
> +# define MEMSET __memset_zva_64
> +# define MEMSET_ZVA 64
> +# define MEMSET_L(label) L(label ## _zva64)
> +# include <sysdeps/aarch64/memset.S>
> +
> +# undef MEMSET
> +# undef MEMSET_ZVA
> +# undef MEMSET_L
> +# define MEMSET __memset_zva_128
> +# define MEMSET_ZVA 128
> +# define MEMSET_L(label) L(label ## _zva128)
> +# include <sysdeps/aarch64/memset.S>
> +
> +# undef MEMSET
> +# undef MEMSET_ZVA
> +# undef MEMSET_L
> +# define MEMSET __memset_zva_default
> +# define MEMSET_ZVA 1
> +# define MEMSET_L(label) L(label ## _zvadef)
> +# include <sysdeps/aarch64/memset.S>
> +#endif
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> index e769eeb..092ee81 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> @@ -20,6 +20,9 @@
>  #include <sys/auxv.h>
>  #include <elf/dl-hwcaps.h>
>  
> +#define DCZID_DZP_MASK (1 << 4)
> +#define DCZID_BS_MASK (0xf)
> +
>  #if HAVE_TUNABLES
>  struct cpu_list
>  {
> @@ -72,4 +75,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>      }
>  
>    cpu_features->midr_el1 = midr;
> +
> +  /* Check if ZVA is enabled.  */
> +  unsigned dczid;
> +  asm volatile ("mrs %0, dczid_el0" : "=r"(dczid));
> +
> +  if ((dczid & DCZID_DZP_MASK) == 0)
> +    cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
>  }
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> index 73cb53d..f2b6afd 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> @@ -47,6 +47,7 @@
>  struct cpu_features
>  {
>    uint64_t midr_el1;
> +  unsigned zva_size;
>  };
>  
>  #endif /* _CPU_FEATURES_AARCH64_H  */
>
Adhemerval Zanella Netto - Sept. 28, 2017, 7:12 p.m.
On 19/09/2017 08:23, Siddhesh Poyarekar wrote:

> The DZP bit in the dczid_el0 register does not change dynamically, so
> it is safe to read once during program startup.  Hoist the zva check
> into an ifunc resolver and store the result into a static variable,
> which can be read in case of non-standard zva sizes.  This effectively
> adds 3 ifunc variants for memset - one for cases where zva is
> disabled, one for 64 byte zva and another for 128 byte zva.  I have
> retained the older memset as __memset_generic for internal libc.so use
> so that the change impact is minimal.  We should eventually have a
> discussion on what is more expensive, reading dczid_el0 on every
> memset invocation or the indirection due to PLT.
>
> The gains due to this are significant for falkor, with gains as high
> as 80% in some cases.  Likewise for mustang, although the numbers are
> slightly lower.  Here's a sample from the falkor tests:
>
I would use a more compact ChangeLog entry, such as:

	* sysdeps/aarch64/memset.S (do_no_zva, do_zva_64,
	do_zva_128, do_zva_default): New macros.

Same for the other entries where it applies.

> 	(MEMSET): Use the new macros.
> 	(MEMSET)[INTERNAL_MEMSET]: Retain old memset.
> 	(MEMSET)[!INTERNAL_MEMSET]: Remove zva check.
> 	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
> 	Add memset_generic, memset_nozva and memset_zva.
> 	* sysdeps/aarch64/multiarch/ifunc-impl-list.c
> 	(__libc_ifunc_impl_list): Add memset ifuncs.
> 	* sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New
> 	static variable __aarch64_zva_size and local variable
> 	zva_size.
> 	* sysdeps/aarch64/multiarch/memset.c: New file.
> 	* sysdeps/aarch64/multiarch/memset_generic.S: New file.
> 	* sysdeps/aarch64/multiarch/memset_nozva.S: New file.
> 	* sysdeps/aarch64/multiarch/memset_zva.S: New file.
> 	* sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> 	(DCZID_DZP_MASK): New macro.
> 	(DCZID_BS_MASK): Likewise.
> 	(init_cpu_features): Read and set zva_size.
> 	* sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> 	(struct cpu_features): New member zva_size.
>
>
> diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
> index 110fd22..8cff3a4 100644
> --- a/sysdeps/aarch64/memset.S
> +++ b/sysdeps/aarch64/memset.S
> @@ -37,7 +37,108 @@
>  #define zva_len x7
>  #define zva_lenw w7
>  
> -ENTRY_ALIGN (__memset, 6)
> +/* Macros that do the critical loops for either no zva or zva of 64 bytes, 128
> +   bytes and higher sizes.  */
> +
> +#ifndef ZVA_MACROS
> +# define ZVA_MACROS
> +/* No ZVA.  */
> +.macro do_no_zva
> +	sub	count, dstend, dst	/* Count is 16 too large.  */
> +	add	dst, dst, 16
> +	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
> +1:	stp	q0, q0, [dst], 64
> +	stp	q0, q0, [dst, -32]
> +	subs	count, count, 64
> +	b.hi	1b
> +	stp	q0, q0, [dstend, -64]
> +	stp	q0, q0, [dstend, -32]
> +	ret
> +.endm
> +
> +/* Write the first and last 64 byte aligned block using stp rather
> +   than using DC ZVA.  This is faster on some cores.  */
> +.macro do_zva_64
> +	str	q0, [dst, 16]
> +	stp	q0, q0, [dst, 32]
> +	bic	dst, dst, 63
> +	stp	q0, q0, [dst, 64]
> +	stp	q0, q0, [dst, 96]
> +	sub	count, dstend, dst	/* Count is now 128 too large.	*/
> +	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
> +	add	dst, dst, 128
> +	nop
> +1:	dc	zva, dst
> +	add	dst, dst, 64
> +	subs	count, count, 64
> +	b.hi	1b
> +	stp	q0, q0, [dst, 0]
> +	stp	q0, q0, [dst, 32]
> +	stp	q0, q0, [dstend, -64]
> +	stp	q0, q0, [dstend, -32]
> +	ret
> +.endm
> +
> +/* ZVA size of 128 bytes.  */
> +.macro do_zva_128
> +	str	q0, [dst, 16]
> +	stp	q0, q0, [dst, 32]
> +	stp	q0, q0, [dst, 64]
> +	stp	q0, q0, [dst, 96]
> +	bic	dst, dst, 127
> +	sub	count, dstend, dst	/* Count is now 128 too large.	*/
> +	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
> +	add	dst, dst, 128
> +1:	dc	zva, dst
> +	add	dst, dst, 128
> +	subs	count, count, 128
> +	b.hi	1b
> +	stp	q0, q0, [dstend, -128]
> +	stp	q0, q0, [dstend, -96]
> +	stp	q0, q0, [dstend, -64]
> +	stp	q0, q0, [dstend, -32]
> +	ret
> +.endm
> +
> +/* ZVA size of more than 128 bytes.  */
> +.macro do_zva_default
> +	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
> +	cmp	count, tmp1
> +	blo	MEMSET_L(no_zva)
> +
> +	sub	tmp2, zva_len, 1
> +	add	tmp1, dst, zva_len
> +	add	dst, dst, 16
> +	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
> +	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
> +	beq	2f
> +1:	stp	q0, q0, [dst], 64
> +	stp	q0, q0, [dst, -32]
> +	subs	count, count, 64
> +	b.hi	1b
> +2:	mov	dst, tmp1
> +	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
> +	subs	count, count, zva_len
> +	b.lo	4f
> +3:	dc	zva, dst
> +	add	dst, dst, zva_len
> +	subs	count, count, zva_len
> +	b.hs	3b
> +4:	add	count, count, zva_len
> +	subs	count, count, 64
> +	b.ls	6f
> +5:	stp	q0, q0, [dst], 64
> +	stp	q0, q0, [dst, -32]
> +	subs	count, count, 64
> +	b.hi	5b
> +6:	stp	q0, q0, [dstend, -64]
> +	stp	q0, q0, [dstend, -32]
> +	ret
> +.endm
> +#endif
> +
> +/* Memset entry point.  */
> +ENTRY_ALIGN (MEMSET, 6)
>  
>  	DELOUSE (0)
>  	DELOUSE (2)
> @@ -46,9 +147,9 @@ ENTRY_ALIGN (__memset, 6)
>  	add	dstend, dstin, count
>  
>  	cmp	count, 96
> -	b.hi	L(set_long)
> +	b.hi	MEMSET_L(set_long)
>  	cmp	count, 16
> -	b.hs	L(set_medium)
> +	b.hs	MEMSET_L(set_medium)
>  	mov	val, v0.D[0]
>  
>  	/* Set 0..15 bytes.  */
> @@ -68,9 +169,9 @@ ENTRY_ALIGN (__memset, 6)
>  3:	ret
>  
>  	/* Set 17..96 bytes.  */
> -L(set_medium):
> +MEMSET_L(set_medium):
>  	str	q0, [dstin]
> -	tbnz	count, 6, L(set96)
> +	tbnz	count, 6, MEMSET_L(set96)
>  	str	q0, [dstend, -16]
>  	tbz	count, 5, 1f
>  	str	q0, [dstin, 16]
> @@ -80,7 +181,7 @@ L(set_medium):
>  	.p2align 4
>  	/* Set 64..96 bytes.  Write 64 bytes from the start and
>  	   32 bytes from the end.  */
> -L(set96):
> +MEMSET_L(set96):
>  	str	q0, [dstin, 16]
>  	stp	q0, q0, [dstin, 32]
>  	stp	q0, q0, [dstend, -32]
> @@ -88,108 +189,63 @@ L(set96):
>  
>  	.p2align 3
>  	nop
> -L(set_long):
> +MEMSET_L(set_long):
> +#ifdef INTERNAL_MEMSET
>  	and	valw, valw, 255
>  	bic	dst, dstin, 15
>  	str	q0, [dstin]
>  	cmp	count, 256
>  	ccmp	valw, 0, 0, cs
> -	b.eq	L(try_zva)
> -L(no_zva):
> -	sub	count, dstend, dst	/* Count is 16 too large.  */
> -	add	dst, dst, 16
> -	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
> -1:	stp	q0, q0, [dst], 64
> -	stp	q0, q0, [dst, -32]
> -L(tail64):
> -	subs	count, count, 64
> -	b.hi	1b
> -2:	stp	q0, q0, [dstend, -64]
> -	stp	q0, q0, [dstend, -32]
> -	ret
> +	b.eq	MEMSET_L(try_zva)
>  
> -	.p2align 3
> -L(try_zva):
> +MEMSET_L(no_zva):
> +	do_no_zva
> +
> +	.p2align 4
> +MEMSET_L(try_zva):
>  	mrs	tmp1, dczid_el0
> -	tbnz	tmp1w, 4, L(no_zva)
>  	and	tmp1w, tmp1w, 15
>  	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
> -	b.ne	 L(zva_128)
> +	b.ne	 MEMSET_L(zva_128)
> +	do_zva_64
>  
> -	/* Write the first and last 64 byte aligned block using stp rather
> -	   than using DC ZVA.  This is faster on some cores.
> -	 */
> -L(zva_64):
> -	str	q0, [dst, 16]
> -	stp	q0, q0, [dst, 32]
> -	bic	dst, dst, 63
> -	stp	q0, q0, [dst, 64]
> -	stp	q0, q0, [dst, 96]
> -	sub	count, dstend, dst	/* Count is now 128 too large.	*/
> -	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
> -	add	dst, dst, 128
> -	nop
> -1:	dc	zva, dst
> -	add	dst, dst, 64
> -	subs	count, count, 64
> -	b.hi	1b
> -	stp	q0, q0, [dst, 0]
> -	stp	q0, q0, [dst, 32]
> -	stp	q0, q0, [dstend, -64]
> -	stp	q0, q0, [dstend, -32]
> -	ret
> -
> -	.p2align 3
> -L(zva_128):
> +MEMSET_L(zva_128):
>  	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
> -	b.ne	L(zva_other)
> +	b.ne	MEMSET_L(zva_other)
> +	do_zva_128
>  
> -	str	q0, [dst, 16]
> -	stp	q0, q0, [dst, 32]
> -	stp	q0, q0, [dst, 64]
> -	stp	q0, q0, [dst, 96]
> -	bic	dst, dst, 127
> -	sub	count, dstend, dst	/* Count is now 128 too large.	*/
> -	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
> -	add	dst, dst, 128
> -1:	dc	zva, dst
> -	add	dst, dst, 128
> -	subs	count, count, 128
> -	b.hi	1b
> -	stp	q0, q0, [dstend, -128]
> -	stp	q0, q0, [dstend, -96]
> -	stp	q0, q0, [dstend, -64]
> -	stp	q0, q0, [dstend, -32]
> -	ret
> -
> -L(zva_other):
> +MEMSET_L(zva_other):
>  	mov	tmp2w, 4
>  	lsl	zva_lenw, tmp2w, tmp1w
> -	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
> -	cmp	count, tmp1
> -	blo	L(no_zva)
> +	do_zva_default
> +#else
> +	/* Memset called through PLT, so we need only one of the ZVA
> +	   variants.  */
> +# ifdef MEMSET_ZVA
> +	and	valw, valw, 255
> +# endif
> +	bic	dst, dstin, 15
> +	str	q0, [dstin]
> +# ifdef MEMSET_ZVA
> +	cmp	count, 256
> +	ccmp	valw, 0, 0, cs
> +	b.eq	MEMSET_L(try_zva)
> +# endif
> +MEMSET_L(no_zva):
> +	do_no_zva
> +# if defined MEMSET_ZVA
> +MEMSET_L(try_zva):
> +#  if MEMSET_ZVA == 64
> +	do_zva_64
> +#  elif MEMSET_ZVA == 128
> +	do_zva_128
> +#  else
> +	adrp	zva_len, __aarch64_zva_size
> +	ldr	zva_len, [zva_len, #:lo12:__aarch64_zva_size]
> +	do_zva_default
> +#  endif
> +# endif
> +#endif
>  
> -	sub	tmp2, zva_len, 1
> -	add	tmp1, dst, zva_len
> -	add	dst, dst, 16
> -	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
> -	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
> -	beq	2f
> -1:	stp	q0, q0, [dst], 64
> -	stp	q0, q0, [dst, -32]
> -	subs	count, count, 64
> -	b.hi	1b
> -2:	mov	dst, tmp1
> -	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
> -	subs	count, count, zva_len
> -	b.lo	4f
> -3:	dc	zva, dst
> -	add	dst, dst, zva_len
> -	subs	count, count, zva_len
> -	b.hs	3b
> -4:	add	count, count, zva_len
> -	b	L(tail64)
> -
> -END (__memset)
> -weak_alias (__memset, memset)
> -libc_hidden_builtin_def (memset)
> +END (MEMSET)
> +libc_hidden_builtin_def (MEMSET)
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index 9aa1e79..f611182 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -1,4 +1,4 @@
>  ifeq ($(subdir),string)
>  sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
> -		   memmove_falkor
> +		   memmove_falkor memset_generic memset_nozva memset_zva
>  endif
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index 2cb74d5..29148ac 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -46,6 +46,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
>  	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
>  	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
> +  IFUNC_IMPL (i, name, memset,
> +	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_nozva)
> +	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva_64)
> +	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 128), __memset_zva_128)
> +	      IFUNC_IMPL_ADD (array, i, memset, (zva_size > 0), __memset_zva_default)
> +	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
>  
>    return i;
>  }
> diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
> index 3af442c..541c27e 100644
> --- a/sysdeps/aarch64/multiarch/init-arch.h
> +++ b/sysdeps/aarch64/multiarch/init-arch.h
> @@ -18,6 +18,9 @@
>  
>  #include <ldsodefs.h>
>  
> -#define INIT_ARCH()				\
> -  uint64_t __attribute__((unused)) midr =	\
> -    GLRO(dl_aarch64_cpu_features).midr_el1;
> +#define INIT_ARCH()							      \
> +  uint64_t __attribute__((unused)) midr =				      \
> +    GLRO(dl_aarch64_cpu_features).midr_el1;				      \
> +  extern unsigned __aarch64_zva_size;					      \
> +  unsigned __attribute__((unused)) zva_size = __aarch64_zva_size =	      \
> +    GLRO(dl_aarch64_cpu_features).zva_size;
> diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
> new file mode 100644
> index 0000000..58e669a
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset.c
> @@ -0,0 +1,47 @@
> +/* Multiple versions of memset. AARCH64 version.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +
> +#if IS_IN (libc)
> +/* Redefine memset so that the compiler won't complain about the type
> +   mismatch with the IFUNC selector in strong_alias, below.  */
> +# undef memset
> +# define memset __redirect_memset
> +# include <string.h>
> +# include <init-arch.h>
> +
> +unsigned __aarch64_zva_size;
> +
> +extern __typeof (__redirect_memset) __libc_memset;
> +
> +extern __typeof (__redirect_memset) __memset_nozva attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_64 attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_128 attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_default attribute_hidden;
> +
> +libc_ifunc (__libc_memset, (zva_size == 0 ? __memset_nozva
> +			    : (zva_size == 64 ? __memset_zva_64
> +			       : (zva_size == 128 ? __memset_zva_128
> +				  : __memset_zva_default))));
> +
> +# undef memset
> +strong_alias (__libc_memset, memset);
> +#else
> +#include <string/memset.c>
> +#endif

You don't need to use the default version for the loader; you can use the
generic sysdeps/aarch64/memset.S by creating an rtld-memset.S in
multiarch and defining the required macros.


> diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S
> new file mode 100644
> index 0000000..56f1e02
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_generic.S
> @@ -0,0 +1,27 @@
> +/* Memset for aarch64, default version for internal use.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#define MEMSET __memset_generic
> +#define INTERNAL_MEMSET
> +#define MEMSET_L(label) L(label)
> +#ifdef SHARED
> +	.globl __GI_memset; __GI_memset = __memset_generic
> +#endif

I would add a comment stating it is essentially doing libc_hidden_def(memset)
and redirecting the internal implementation to __memset_generic.

> +
> +#include <sysdeps/aarch64/memset.S>
> diff --git a/sysdeps/aarch64/multiarch/memset_nozva.S b/sysdeps/aarch64/multiarch/memset_nozva.S
> new file mode 100644
> index 0000000..98045ac
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_nozva.S
> @@ -0,0 +1,22 @@
> +/* Memset for aarch64, ZVA disabled.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#define MEMSET __memset_nozva
> +#define MEMSET_L(label) L(label)
> +#include <sysdeps/aarch64/memset.S>

Although not strictly required, I think we should avoid building these
for !IS_IN(libc), as is done for memset_zva.S. The same applies to
memset_generic.S.

> diff --git a/sysdeps/aarch64/multiarch/memset_zva.S b/sysdeps/aarch64/multiarch/memset_zva.S
> new file mode 100644
> index 0000000..5d02b89
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_zva.S
> @@ -0,0 +1,41 @@
> +/* Memset for aarch64, ZVA enabled.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (libc)
> +# define MEMSET __memset_zva_64
> +# define MEMSET_ZVA 64
> +# define MEMSET_L(label) L(label ## _zva64)
> +# include <sysdeps/aarch64/memset.S>
> +
> +# undef MEMSET
> +# undef MEMSET_ZVA
> +# undef MEMSET_L
> +# define MEMSET __memset_zva_128
> +# define MEMSET_ZVA 128
> +# define MEMSET_L(label) L(label ## _zva128)
> +# include <sysdeps/aarch64/memset.S>
> +
> +# undef MEMSET
> +# undef MEMSET_ZVA
> +# undef MEMSET_L
> +# define MEMSET __memset_zva_default
> +# define MEMSET_ZVA 1
> +# define MEMSET_L(label) L(label ## _zvadef)
> +# include <sysdeps/aarch64/memset.S>
> +#endif
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> index e769eeb..092ee81 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> @@ -20,6 +20,9 @@
>  #include <sys/auxv.h>
>  #include <elf/dl-hwcaps.h>
>  
> +#define DCZID_DZP_MASK (1 << 4)
> +#define DCZID_BS_MASK (0xf)
> +
>  #if HAVE_TUNABLES
>  struct cpu_list
>  {
> @@ -72,4 +75,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>      }
>  
>    cpu_features->midr_el1 = midr;
> +
> +  /* Check if ZVA is enabled.  */
> +  unsigned dczid;
> +  asm volatile ("mrs %0, dczid_el0" : "=r"(dczid));
> +
> +  if ((dczid & DCZID_DZP_MASK) == 0)
> +    cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
>  }
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> index 73cb53d..f2b6afd 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> @@ -47,6 +47,7 @@
>  struct cpu_features
>  {
>    uint64_t midr_el1;
> +  unsigned zva_size;
>  };
>  
>  #endif /* _CPU_FEATURES_AARCH64_H  */

Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Siddhesh Poyarekar - Oct. 3, 2017, 11:51 a.m.
On Friday 29 September 2017 12:42 AM, Adhemerval Zanella wrote:
> I would use a more compact ChangeLog entry as:
> 
> 	* sysdeps/aarch64/memset.S (do_no_zva, do_zva_64,
> 	do_zva_128, do_zva_default): New macros.
> 
> Same for the other entries where it applies.

Done.

> You don't need to use the default version for the loader; you can use the
> generic sysdeps/aarch64/memset.S by creating an rtld-memset.S in
> multiarch and defining the required macros.

Done.

> I would add a comment stating it is essentially doing libc_hidden_def(memset)
> and redirecting the internal implementation to __memset_generic.

Done.

> Although not strictly required, I think we should avoid building these
> for !IS_IN(libc), as is done for memset_zva.S. The same applies to
> memset_generic.S.

Done.

Posting updated patch shortly.

Thanks,
Siddhesh

Patch

========================================================================================================================
                  length=256, char=0:         1.82 (-87.26%)	       26.99 ( 89.30%)	       25.49 ( 78.76%)	       23.48 ( 64.65%)	       14.26
                  length=257, char=0:         1.82 (-87.29%)	       26.97 ( 88.44%)	       25.77 ( 80.12%)	       24.41 ( 70.57%)	       14.31
                  length=258, char=0:         1.82 (-87.38%)	       26.27 ( 82.29%)	       25.84 ( 79.28%)	       24.33 ( 68.80%)	       14.41
                  length=259, char=0:         1.82 (-87.36%)	       26.06 ( 81.15%)	       25.72 ( 78.84%)	       24.57 ( 70.80%)	       14.38
                  length=260, char=0:         1.82 (-87.44%)	       25.35 ( 75.23%)	       25.93 ( 79.23%)	       24.34 ( 68.24%)	       14.47
                  length=261, char=0:         1.82 (-87.49%)	       26.15 ( 79.70%)	       26.01 ( 78.72%)	       24.44 ( 67.97%)	       14.55
                  length=262, char=0:         1.82 (-87.54%)	       25.91 ( 77.31%)	       26.06 ( 78.35%)	       24.33 ( 66.49%)	       14.61
                  length=263, char=0:         1.82 (-87.54%)	       25.69 ( 75.80%)	       25.96 ( 77.63%)	       24.54 ( 67.90%)	       14.61
                  length=264, char=0:         1.82 (-87.57%)	       25.31 ( 72.69%)	       26.16 ( 78.43%)	       24.63 ( 68.00%)	       14.66
                  length=265, char=0:         1.82 (-87.65%)	       25.29 ( 71.35%)	       26.25 ( 77.84%)	       24.58 ( 66.53%)	       14.76
                  length=266, char=0:         1.82 (-87.69%)	       25.10 ( 69.40%)	       26.15 ( 76.48%)	       24.77 ( 67.22%)	       14.82
                  length=267, char=0:         1.82 (-87.69%)	       24.89 ( 68.02%)	       26.20 ( 76.90%)	       24.87 ( 67.91%)	       14.81
                  length=268, char=0:         1.82 (-87.74%)	       24.07 ( 62.04%)	       26.40 ( 77.74%)	       24.95 ( 67.93%)	       14.85
                  length=269, char=0:         1.82 (-87.80%)	       23.82 ( 59.29%)	       26.47 ( 77.00%)	       24.89 ( 66.43%)	       14.96
                  length=270, char=0:         1.82 (-87.84%)	       23.65 ( 57.61%)	       26.35 ( 75.58%)	       25.07 ( 67.07%)	       15.01
                  length=271, char=0:         1.83 (-87.82%)	       23.48 ( 56.53%)	       26.39 ( 75.93%)	       25.15 ( 67.66%)	       15.00
                  length=512, char=0:         1.90 (-92.59%)	       29.25 ( 13.81%)	       36.30 ( 41.27%)	       40.95 ( 59.36%)	       25.70
                  length=513, char=0:         1.90 (-92.57%)	       29.29 ( 14.35%)	       36.63 ( 43.01%)	       40.80 ( 59.28%)	       25.61
                  length=514, char=0:         1.90 (-92.62%)	       28.61 ( 10.91%)	       36.64 ( 42.05%)	       40.89 ( 58.52%)	       25.80
                  length=515, char=0:         1.90 (-92.63%)	       28.74 ( 11.29%)	       36.68 ( 42.06%)	       40.56 ( 57.08%)	       25.82
                  length=516, char=0:         1.90 (-92.65%)	       28.33 (  9.54%)	       36.72 ( 41.96%)	       40.09 ( 55.01%)	       25.87
                  length=517, char=0:         1.90 (-92.66%)	       28.41 (  9.60%)	       36.80 ( 41.97%)	       39.43 ( 52.13%)	       25.92
                  length=518, char=0:         1.90 (-92.66%)	       28.16 (  8.45%)	       36.84 ( 41.89%)	       39.40 ( 51.77%)	       25.96
                  length=519, char=0:         1.90 (-92.67%)	       28.21 (  8.58%)	       36.86 ( 41.86%)	       40.39 ( 55.46%)	       25.98
                  length=520, char=0:         1.90 (-92.65%)	       27.53 (  6.32%)	       36.90 ( 42.49%)	       40.80 ( 57.58%)	       25.89
                  length=521, char=0:         1.90 (-92.69%)	       27.53 (  5.65%)	       36.61 ( 40.50%)	       40.86 ( 56.81%)	       26.05
                  length=522, char=0:         1.90 (-92.66%)	       27.40 (  5.59%)	       36.95 ( 42.35%)	       40.92 ( 57.64%)	       25.95
                  length=523, char=0:         1.91 (-92.71%)	       27.50 (  5.29%)	       36.69 ( 40.45%)	       40.97 ( 56.84%)	       26.12
                  length=524, char=0:         1.90 (-92.69%)	       27.33 (  5.02%)	       37.00 ( 42.18%)	       40.98 ( 57.50%)	       26.02
                  length=525, char=0:         1.91 (-92.72%)	       27.24 (  4.04%)	       36.70 ( 40.13%)	       41.04 ( 56.72%)	       26.19
                  length=526, char=0:         1.90 (-92.70%)	       27.06 (  3.73%)	       37.06 ( 42.05%)	       41.08 ( 57.44%)	       26.09
                  length=527, char=0:         1.91 (-92.74%)	       26.82 (  2.17%)	       37.06 ( 41.17%)	       41.11 ( 56.62%)	       26.25
                 length=1024, char=0:         1.95 (-95.35%)	       30.55 (-27.12%)	       46.52 ( 10.99%)	       49.89 ( 19.02%)	       41.91
                 length=1025, char=0:         1.95 (-95.31%)	       30.58 (-26.47%)	       46.57 ( 11.98%)	       49.92 ( 20.05%)	       41.59
                 length=1026, char=0:         1.95 (-95.36%)	       30.35 (-27.70%)	       46.56 ( 10.92%)	       49.45 ( 17.79%)	       41.98
                 length=1027, char=0:         1.95 (-95.36%)	       30.24 (-28.02%)	       46.20 (  9.98%)	       49.93 ( 18.88%)	       42.00
                 length=1028, char=0:         1.95 (-95.37%)	       29.75 (-29.25%)	       46.58 ( 10.76%)	       49.92 ( 18.71%)	       42.05
                 length=1029, char=0:         1.95 (-95.37%)	       29.78 (-29.24%)	       46.57 ( 10.65%)	       49.96 ( 18.72%)	       42.08
                 length=1030, char=0:         1.95 (-95.33%)	       29.77 (-28.73%)	       46.63 ( 11.63%)	       49.97 ( 19.64%)	       41.77
                 length=1031, char=0:         1.95 (-95.37%)	       29.64 (-29.68%)	       46.62 ( 10.59%)	       49.51 ( 17.46%)	       42.15
                 length=1032, char=0:         1.95 (-95.38%)	       29.60 (-29.80%)	       46.22 (  9.63%)	       49.99 ( 18.58%)	       42.16
                 length=1033, char=0:         1.95 (-95.38%)	       29.32 (-30.55%)	       46.65 ( 10.49%)	       49.95 ( 18.32%)	       42.22
                 length=1034, char=0:         1.95 (-95.39%)	       29.45 (-30.31%)	       46.67 ( 10.44%)	       50.01 ( 18.36%)	       42.25
                 length=1035, char=0:         1.95 (-95.35%)	       29.31 (-30.09%)	       46.68 ( 11.34%)	       50.02 ( 19.31%)	       41.92
                 length=1036, char=0:         1.95 (-95.40%)	       29.30 (-30.75%)	       46.66 ( 10.27%)	       49.56 ( 17.12%)	       42.32
                 length=1037, char=0:         1.95 (-95.39%)	       29.17 (-31.08%)	       46.30 (  9.38%)	       50.04 ( 18.22%)	       42.33
                 length=1038, char=0:         1.95 (-95.40%)	       29.12 (-31.30%)	       46.71 ( 10.19%)	       50.02 ( 18.01%)	       42.39
                 length=1039, char=0:         1.95 (-95.40%)	       29.19 (-31.20%)	       46.73 ( 10.14%)	       50.06 ( 18.00%)	       42.43

	* sysdeps/aarch64/memset.S (do_no_zva): New macro.
	(do_zva_64): Likewise.
	(do_zva_128): Likewise.
	(do_zva_default): Likewise.
	(__memset): Rename to MEMSET macro.
	(MEMSET): Use the new macros.
	(MEMSET)[INTERNAL_MEMSET]: Retain old memset.
	(MEMSET)[!INTERNAL_MEMSET]: Remove zva check.
	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
	Add memset_generic, memset_nozva and memset_zva.
	* sysdeps/aarch64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Add memset ifuncs.
	* sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New
	static variable __aarch64_zva_size and local variable
	zva_size.
	* sysdeps/aarch64/multiarch/memset.c: New file.
	* sysdeps/aarch64/multiarch/memset_generic.S: New file.
	* sysdeps/aarch64/multiarch/memset_nozva.S: New file.
	* sysdeps/aarch64/multiarch/memset_zva.S: New file.
	* sysdeps/unix/sysv/linux/aarch64/cpu-features.c
	(DCZID_DZP_MASK): New macro.
	(DCZID_BS_MASK): Likewise.
	(init_cpu_features): Read and set zva_size.
	* sysdeps/unix/sysv/linux/aarch64/cpu-features.h
	(struct cpu_features): New member zva_size.
---
 sysdeps/aarch64/memset.S                       | 248 +++++++++++++++----------
 sysdeps/aarch64/multiarch/Makefile             |   2 +-
 sysdeps/aarch64/multiarch/ifunc-impl-list.c    |   6 +
 sysdeps/aarch64/multiarch/init-arch.h          |   9 +-
 sysdeps/aarch64/multiarch/memset.c             |  47 +++++
 sysdeps/aarch64/multiarch/memset_generic.S     |  27 +++
 sysdeps/aarch64/multiarch/memset_nozva.S       |  22 +++
 sysdeps/aarch64/multiarch/memset_zva.S         |  41 ++++
 sysdeps/unix/sysv/linux/aarch64/cpu-features.c |  10 +
 sysdeps/unix/sysv/linux/aarch64/cpu-features.h |   1 +
 10 files changed, 313 insertions(+), 100 deletions(-)
 create mode 100644 sysdeps/aarch64/multiarch/memset.c
 create mode 100644 sysdeps/aarch64/multiarch/memset_generic.S
 create mode 100644 sysdeps/aarch64/multiarch/memset_nozva.S
 create mode 100644 sysdeps/aarch64/multiarch/memset_zva.S

diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 110fd22..8cff3a4 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -37,7 +37,108 @@ 
 #define zva_len x7
 #define zva_lenw w7
 
-ENTRY_ALIGN (__memset, 6)
+/* Macros that do the critical loops for either no zva or zva of 64 bytes, 128
+   bytes and higher sizes.  */
+
+#ifndef ZVA_MACROS
+# define ZVA_MACROS
+/* No ZVA.  */
+.macro do_no_zva
+	sub	count, dstend, dst	/* Count is 16 too large.  */
+	add	dst, dst, 16
+	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
+1:	stp	q0, q0, [dst], 64
+	stp	q0, q0, [dst, -32]
+	subs	count, count, 64
+	b.hi	1b
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+.endm
+
+/* Write the first and last 64 byte aligned block using stp rather
+   than using DC ZVA.  This is faster on some cores.  */
+.macro do_zva_64
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	bic	dst, dst, 63
+	stp	q0, q0, [dst, 64]
+	stp	q0, q0, [dst, 96]
+	sub	count, dstend, dst	/* Count is now 128 too large.	*/
+	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+	nop
+1:	dc	zva, dst
+	add	dst, dst, 64
+	subs	count, count, 64
+	b.hi	1b
+	stp	q0, q0, [dst, 0]
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+.endm
+
+/* ZVA size of 128 bytes.  */
+.macro do_zva_128
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]
+	stp	q0, q0, [dst, 96]
+	bic	dst, dst, 127
+	sub	count, dstend, dst	/* Count is now 128 too large.	*/
+	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+1:	dc	zva, dst
+	add	dst, dst, 128
+	subs	count, count, 128
+	b.hi	1b
+	stp	q0, q0, [dstend, -128]
+	stp	q0, q0, [dstend, -96]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+.endm
+
+/* ZVA size of more than 128 bytes.  */
+.macro do_zva_default
+	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
+	cmp	count, tmp1
+	blo	MEMSET_L(no_zva)
+
+	sub	tmp2, zva_len, 1
+	add	tmp1, dst, zva_len
+	add	dst, dst, 16
+	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
+	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
+	beq	2f
+1:	stp	q0, q0, [dst], 64
+	stp	q0, q0, [dst, -32]
+	subs	count, count, 64
+	b.hi	1b
+2:	mov	dst, tmp1
+	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
+	subs	count, count, zva_len
+	b.lo	4f
+3:	dc	zva, dst
+	add	dst, dst, zva_len
+	subs	count, count, zva_len
+	b.hs	3b
+4:	add	count, count, zva_len
+	subs	count, count, 64
+	b.ls	6f
+5:	stp	q0, q0, [dst], 64
+	stp	q0, q0, [dst, -32]
+	subs	count, count, 64
+	b.hi	5b
+6:	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+.endm
+#endif
+
+/* Memset entry point.  */
+ENTRY_ALIGN (MEMSET, 6)
 
 	DELOUSE (0)
 	DELOUSE (2)
@@ -46,9 +147,9 @@  ENTRY_ALIGN (__memset, 6)
 	add	dstend, dstin, count
 
 	cmp	count, 96
-	b.hi	L(set_long)
+	b.hi	MEMSET_L(set_long)
 	cmp	count, 16
-	b.hs	L(set_medium)
+	b.hs	MEMSET_L(set_medium)
 	mov	val, v0.D[0]
 
 	/* Set 0..15 bytes.  */
@@ -68,9 +169,9 @@  ENTRY_ALIGN (__memset, 6)
 3:	ret
 
 	/* Set 17..96 bytes.  */
-L(set_medium):
+MEMSET_L(set_medium):
 	str	q0, [dstin]
-	tbnz	count, 6, L(set96)
+	tbnz	count, 6, MEMSET_L(set96)
 	str	q0, [dstend, -16]
 	tbz	count, 5, 1f
 	str	q0, [dstin, 16]
@@ -80,7 +181,7 @@  L(set_medium):
 	.p2align 4
 	/* Set 64..96 bytes.  Write 64 bytes from the start and
 	   32 bytes from the end.  */
-L(set96):
+MEMSET_L(set96):
 	str	q0, [dstin, 16]
 	stp	q0, q0, [dstin, 32]
 	stp	q0, q0, [dstend, -32]
@@ -88,108 +189,63 @@  L(set96):
 
 	.p2align 3
 	nop
-L(set_long):
+MEMSET_L(set_long):
+#ifdef INTERNAL_MEMSET
 	and	valw, valw, 255
 	bic	dst, dstin, 15
 	str	q0, [dstin]
 	cmp	count, 256
 	ccmp	valw, 0, 0, cs
-	b.eq	L(try_zva)
-L(no_zva):
-	sub	count, dstend, dst	/* Count is 16 too large.  */
-	add	dst, dst, 16
-	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
-1:	stp	q0, q0, [dst], 64
-	stp	q0, q0, [dst, -32]
-L(tail64):
-	subs	count, count, 64
-	b.hi	1b
-2:	stp	q0, q0, [dstend, -64]
-	stp	q0, q0, [dstend, -32]
-	ret
+	b.eq	MEMSET_L(try_zva)
 
-	.p2align 3
-L(try_zva):
+MEMSET_L(no_zva):
+	do_no_zva
+
+	.p2align 4
+MEMSET_L(try_zva):
 	mrs	tmp1, dczid_el0
-	tbnz	tmp1w, 4, L(no_zva)
 	and	tmp1w, tmp1w, 15
 	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
-	b.ne	 L(zva_128)
+	b.ne	 MEMSET_L(zva_128)
+	do_zva_64
 
-	/* Write the first and last 64 byte aligned block using stp rather
-	   than using DC ZVA.  This is faster on some cores.
-	 */
-L(zva_64):
-	str	q0, [dst, 16]
-	stp	q0, q0, [dst, 32]
-	bic	dst, dst, 63
-	stp	q0, q0, [dst, 64]
-	stp	q0, q0, [dst, 96]
-	sub	count, dstend, dst	/* Count is now 128 too large.	*/
-	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
-	add	dst, dst, 128
-	nop
-1:	dc	zva, dst
-	add	dst, dst, 64
-	subs	count, count, 64
-	b.hi	1b
-	stp	q0, q0, [dst, 0]
-	stp	q0, q0, [dst, 32]
-	stp	q0, q0, [dstend, -64]
-	stp	q0, q0, [dstend, -32]
-	ret
-
-	.p2align 3
-L(zva_128):
+MEMSET_L(zva_128):
 	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
-	b.ne	L(zva_other)
+	b.ne	MEMSET_L(zva_other)
+	do_zva_128
 
-	str	q0, [dst, 16]
-	stp	q0, q0, [dst, 32]
-	stp	q0, q0, [dst, 64]
-	stp	q0, q0, [dst, 96]
-	bic	dst, dst, 127
-	sub	count, dstend, dst	/* Count is now 128 too large.	*/
-	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
-	add	dst, dst, 128
-1:	dc	zva, dst
-	add	dst, dst, 128
-	subs	count, count, 128
-	b.hi	1b
-	stp	q0, q0, [dstend, -128]
-	stp	q0, q0, [dstend, -96]
-	stp	q0, q0, [dstend, -64]
-	stp	q0, q0, [dstend, -32]
-	ret
-
-L(zva_other):
+MEMSET_L(zva_other):
 	mov	tmp2w, 4
 	lsl	zva_lenw, tmp2w, tmp1w
-	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
-	cmp	count, tmp1
-	blo	L(no_zva)
+	do_zva_default
+#else
+	/* Memset called through PLT, so we need only one of the ZVA
+	   variants.  */
+# ifdef MEMSET_ZVA
+	and	valw, valw, 255
+# endif
+	bic	dst, dstin, 15
+	str	q0, [dstin]
+# ifdef MEMSET_ZVA
+	cmp	count, 256
+	ccmp	valw, 0, 0, cs
+	b.eq	MEMSET_L(try_zva)
+# endif
+MEMSET_L(no_zva):
+	do_no_zva
+# if defined MEMSET_ZVA
+MEMSET_L(try_zva):
+#  if MEMSET_ZVA == 64
+	do_zva_64
+#  elif MEMSET_ZVA == 128
+	do_zva_128
+#  else	/* Any other block size: load the value stored by INIT_ARCH.  */
+	adrp	zva_len, __aarch64_zva_size
+	ldr	zva_lenw, [zva_len, #:lo12:__aarch64_zva_size]	/* unsigned: 32-bit load, zero-extends.  */
+	do_zva_default
+#  endif
+# endif
+#endif
 
-	sub	tmp2, zva_len, 1
-	add	tmp1, dst, zva_len
-	add	dst, dst, 16
-	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
-	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
-	beq	2f
-1:	stp	q0, q0, [dst], 64
-	stp	q0, q0, [dst, -32]
-	subs	count, count, 64
-	b.hi	1b
-2:	mov	dst, tmp1
-	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
-	subs	count, count, zva_len
-	b.lo	4f
-3:	dc	zva, dst
-	add	dst, dst, zva_len
-	subs	count, count, zva_len
-	b.hs	3b
-4:	add	count, count, zva_len
-	b	L(tail64)
-
-END (__memset)
-weak_alias (__memset, memset)
-libc_hidden_builtin_def (memset)
+END (MEMSET)
+libc_hidden_builtin_def (MEMSET)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 9aa1e79..f611182 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,4 +1,4 @@ 
 ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
-		   memmove_falkor
+		   memmove_falkor memset_generic memset_nozva memset_zva
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 2cb74d5..29148ac 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -46,6 +46,12 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
+  IFUNC_IMPL (i, name, memset,
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_nozva)
+	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva_64)
+	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 128), __memset_zva_128)
+	      IFUNC_IMPL_ADD (array, i, memset, (zva_size > 0), __memset_zva_default)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
 
   return i;
 }
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
index 3af442c..541c27e 100644
--- a/sysdeps/aarch64/multiarch/init-arch.h
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -18,6 +18,9 @@ 
 
 #include <ldsodefs.h>
 
-#define INIT_ARCH()				\
-  uint64_t __attribute__((unused)) midr =	\
-    GLRO(dl_aarch64_cpu_features).midr_el1;
+#define INIT_ARCH()							      \
+  uint64_t __attribute__((unused)) midr =				      \
+    GLRO(dl_aarch64_cpu_features).midr_el1;				      \
+  extern unsigned __aarch64_zva_size;					      \
+  unsigned __attribute__((unused)) zva_size = __aarch64_zva_size =	      \
+    GLRO(dl_aarch64_cpu_features).zva_size;
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
new file mode 100644
index 0000000..58e669a
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -0,0 +1,47 @@ 
+/* Multiple versions of memset. AARCH64 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+
+#if IS_IN (libc)
+/* Redefine memset so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memset
+# define memset __redirect_memset
+# include <string.h>
+# include <init-arch.h>
+
+unsigned __aarch64_zva_size;
+
+extern __typeof (__redirect_memset) __libc_memset;
+
+extern __typeof (__redirect_memset) __memset_nozva attribute_hidden;
+extern __typeof (__redirect_memset) __memset_zva_64 attribute_hidden;
+extern __typeof (__redirect_memset) __memset_zva_128 attribute_hidden;
+extern __typeof (__redirect_memset) __memset_zva_default attribute_hidden;
+
+libc_ifunc (__libc_memset, (zva_size == 0 ? __memset_nozva
+			    : (zva_size == 64 ? __memset_zva_64
+			       : (zva_size == 128 ? __memset_zva_128
+				  : __memset_zva_default))));
+
+# undef memset
+strong_alias (__libc_memset, memset);
+#else
+#include <string/memset.c>
+#endif
diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S
new file mode 100644
index 0000000..56f1e02
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_generic.S
@@ -0,0 +1,27 @@ 
+/* Memset for aarch64, default version for internal use.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define MEMSET __memset_generic
+#define INTERNAL_MEMSET
+#define MEMSET_L(label) L(label)
+#ifdef SHARED
+	.globl __GI_memset; __GI_memset = __memset_generic
+#endif
+
+#include <sysdeps/aarch64/memset.S>
diff --git a/sysdeps/aarch64/multiarch/memset_nozva.S b/sysdeps/aarch64/multiarch/memset_nozva.S
new file mode 100644
index 0000000..98045ac
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_nozva.S
@@ -0,0 +1,22 @@ 
+/* Memset for aarch64, ZVA disabled.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define MEMSET __memset_nozva
+#define MEMSET_L(label) L(label)
+#include <sysdeps/aarch64/memset.S>
diff --git a/sysdeps/aarch64/multiarch/memset_zva.S b/sysdeps/aarch64/multiarch/memset_zva.S
new file mode 100644
index 0000000..5d02b89
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_zva.S
@@ -0,0 +1,41 @@ 
+/* Memset for aarch64, ZVA enabled.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# define MEMSET __memset_zva_64
+# define MEMSET_ZVA 64	/* Selects do_zva_64 in memset.S.  */
+# define MEMSET_L(label) L(label ## _zva64)	/* Per-variant label suffix.  */
+# include <sysdeps/aarch64/memset.S>
+
+# undef MEMSET
+# undef MEMSET_ZVA
+# undef MEMSET_L
+# define MEMSET __memset_zva_128
+# define MEMSET_ZVA 128	/* Selects do_zva_128 in memset.S.  */
+# define MEMSET_L(label) L(label ## _zva128)
+# include <sysdeps/aarch64/memset.S>
+
+# undef MEMSET
+# undef MEMSET_ZVA
+# undef MEMSET_L
+# define MEMSET __memset_zva_default
+# define MEMSET_ZVA 1	/* Neither 64 nor 128: selects do_zva_default.  */
+# define MEMSET_L(label) L(label ## _zvadef)
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
index e769eeb..092ee81 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
@@ -20,6 +20,9 @@ 
 #include <sys/auxv.h>
 #include <elf/dl-hwcaps.h>
 
+#define DCZID_DZP_MASK (1 << 4)
+#define DCZID_BS_MASK (0xf)
+
 #if HAVE_TUNABLES
 struct cpu_list
 {
@@ -72,4 +75,11 @@  init_cpu_features (struct cpu_features *cpu_features)
     }
 
   cpu_features->midr_el1 = midr;
+
+  /* Record the DC ZVA block size in bytes, unless DZP prohibits DC ZVA.  */
+  unsigned dczid;
+  asm volatile ("mrs %0, dczid_el0" : "=r"(dczid));
+
+  if ((dczid & DCZID_DZP_MASK) == 0)
+    cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
 }
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index 73cb53d..f2b6afd 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -47,6 +47,7 @@ 
 struct cpu_features
 {
   uint64_t midr_el1;
+  unsigned zva_size;
 };
 
 #endif /* _CPU_FEATURES_AARCH64_H  */