Patchwork [PATCHv3,1/2] aarch64: Hoist ZVA check out of the memset function

login
register
mail settings
Submitter Siddhesh Poyarekar
Date Oct. 5, 2017, 5:16 p.m.
Message ID <1507223795-4893-1-git-send-email-siddhesh@sourceware.org>
Download mbox | patch
Permalink /patch/23358/
State New
Headers show

Comments

Siddhesh Poyarekar - Oct. 5, 2017, 5:16 p.m.
The DZP bit in the dczid_el0 register does not change dynamically, so
it is safe to read once during program startup.  Hoist the zva check
into an ifunc resolver and store the result into a static variable,
which can be read in case of non-standard zva sizes.  This effectively
adds 3 ifunc variants for memset - one for cases where zva is
disabled, one for 64 byte zva and another for 128 byte zva.  I have
retained the older memset as __memset_generic for internal libc.so use
so that the change impact is minimal.  We should eventually have a
discussion on what is more expensive, reading dczid_el0 on every
memset invocation or the indirection due to PLT.

The gains due to this are significant for falkor, with run time
reductions as high as 42% in some cases.  Likewise for mustang,
although the numbers are slightly lower.  Here's a sample from the
falkor tests:

Function: memset
Variant: walk
                                    simple_memset	__memset_nozva	__memset_zva_64	__memset_generic
Siddhesh Poyarekar - Oct. 10, 2017, 10:05 a.m.
Ping!

On Thursday 05 October 2017 10:46 PM, Siddhesh Poyarekar wrote:
> The DZP bit in the dczid_el0 register does not change dynamically, so
> it is safe to read once during program startup.  Hoist the zva check
> into an ifunc resolver and store the result into a static variable,
> which can be read in case of non-standard zva sizes.  This effectively
> adds 3 ifunc variants for memset - one for cases where zva is
> disabled, one for 64 byte zva and another for 128 byte zva.  I have
> retained the older memset as __memset_generic for internal libc.so use
> so that the change impact is minimal.  We should eventually have a
> discussion on what is more expensive, reading dczid_el0 on every
> memset invocation or the indirection due to PLT.
> 
> The gains due to this are significant for falkor, with run time
> reductions as high as 42% in some cases.  Likewise for mustang,
> although the numbers are slightly lower.  Here's a sample from the
> falkor tests:
> 
> Function: memset
> Variant: walk
>                                     simple_memset	__memset_nozva	__memset_zva_64	__memset_generic
> ========================================================================================================================
>                   length=256, char=0:     35936.10 (-706.66%)	     2429.88 ( 45.46%)	     2571.85 ( 42.27%)	     4454.92
>                   length=257, char=0:     36209.50 (-710.17%)	     2436.12 ( 45.49%)	     2564.25 ( 42.63%)	     4469.36
>                   length=258, char=0:     36507.90 (-710.21%)	     2522.06 ( 44.03%)	     2578.89 ( 42.77%)	     4505.99
>                   length=259, char=0:     36764.30 (-711.99%)	     2611.61 ( 42.32%)	     2593.52 ( 42.72%)	     4527.69
>                   length=260, char=0:     36943.30 (-712.62%)	     2639.06 ( 41.95%)	     2608.24 ( 42.63%)	     4546.19
>                   length=261, char=0:     37287.50 (-717.27%)	     2623.07 ( 42.51%)	     2623.17 ( 42.51%)	     4562.47
>                   length=262, char=0:     37573.70 (-722.44%)	     2665.51 ( 41.66%)	     2637.28 ( 42.27%)	     4568.56
>                   length=263, char=0:     37833.70 (-724.30%)	     2692.70 ( 41.33%)	     2668.38 ( 41.86%)	     4589.79
>                   length=264, char=0:     38136.00 (-727.49%)	     2737.30 ( 40.61%)	     2685.48 ( 41.73%)	     4608.66
>                   length=265, char=0:     38403.10 (-730.30%)	     2778.70 ( 39.92%)	     2695.10 ( 41.73%)	     4625.23
>                   length=266, char=0:     38684.50 (-729.88%)	     2822.16 ( 39.46%)	     2692.91 ( 42.23%)	     4661.47
>                   length=267, char=0:     38954.10 (-732.30%)	     2867.41 ( 38.73%)	     2706.28 ( 42.18%)	     4680.31
>                   length=268, char=0:     39155.00 (-733.08%)	     2968.76 ( 36.84%)	     2721.89 ( 42.09%)	     4700.03
>                   length=269, char=0:     39559.30 (-737.49%)	     3057.49 ( 35.27%)	     2737.61 ( 42.04%)	     4723.54
>                   length=270, char=0:     39813.80 (-742.51%)	     3073.64 ( 34.96%)	     2751.70 ( 41.77%)	     4725.60
>                   length=271, char=0:     40070.60 (-744.40%)	     3103.55 ( 34.60%)	     2784.25 ( 41.33%)	     4745.43
>                   length=512, char=0:    137515.00 (-1275.48%)	     8971.95 ( 10.26%)	     7168.66 ( 28.30%)	     9997.61
>                   length=513, char=0:    138015.00 (-1284.40%)	     8987.07 (  9.85%)	     7242.59 ( 27.35%)	     9969.29
>                   length=514, char=0:    138556.00 (-1286.76%)	     9200.17 (  7.92%)	     7211.49 ( 27.82%)	     9991.38
>                   length=515, char=0:    139182.00 (-1277.21%)	     9223.64 (  8.73%)	     7232.78 ( 28.43%)	    10106.10
>                   length=516, char=0:    139512.00 (-1288.41%)	     9306.80 (  7.38%)	     7312.15 ( 27.23%)	    10048.30
>                   length=517, char=0:    140117.00 (-1292.65%)	     9429.22 (  6.28%)	     7273.52 ( 27.71%)	    10061.20
>                   length=518, char=0:    140706.00 (-1294.63%)	     9463.83 (  6.20%)	     7292.57 ( 27.72%)	    10089.10
>                   length=519, char=0:    141221.00 (-1289.12%)	     9548.99 (  6.07%)	     7312.75 ( 28.07%)	    10166.20
>                   length=520, char=0:    141696.00 (-1297.00%)	     9713.49 (  4.27%)	     7386.44 ( 27.21%)	    10147.00
>                   length=521, char=0:    142309.00 (-1298.82%)	     9888.41 (  2.80%)	     7361.91 ( 27.64%)	    10173.50
>                   length=522, char=0:    142878.00 (-1292.34%)	     9909.30 (  3.43%)	     7381.22 ( 28.07%)	    10261.70
>                   length=523, char=0:    143327.00 (-1300.69%)	     9918.78 (  3.07%)	     7462.93 ( 27.07%)	    10232.60
>                   length=524, char=0:    143776.00 (-1301.67%)	    10055.40 (  1.97%)	     7428.56 ( 27.58%)	    10257.50
>                   length=525, char=0:    144429.00 (-1296.79%)	    10090.80 (  2.41%)	     7449.84 ( 27.95%)	    10340.10
>                   length=526, char=0:    144976.00 (-1305.05%)	    10178.80 (  1.35%)	     7530.66 ( 27.02%)	    10318.20
>                   length=527, char=0:    145551.00 (-1306.63%)	    10314.40 (  0.32%)	     7498.48 ( 27.53%)	    10347.50
>                  length=1024, char=0:    537600.00 (-2116.32%)	    34541.10 (-42.40%)	    22541.00 (  7.07%)	    24256.40
>                  length=1025, char=0:    538490.00 (-2117.66%)	    34560.10 (-42.33%)	    22574.10 (  7.03%)	    24281.90
>                  length=1026, char=0:    539596.00 (-2118.30%)	    34869.20 (-43.35%)	    22615.10 (  7.03%)	    24324.70
>                  length=1027, char=0:    540544.00 (-2118.30%)	    35020.60 (-43.72%)	    22654.60 (  7.03%)	    24367.50
>                  length=1028, char=0:    541355.00 (-2119.44%)	    35407.20 (-45.16%)	    22702.00 (  6.93%)	    24391.50
>                  length=1029, char=0:    542678.00 (-2121.52%)	    35806.10 (-46.58%)	    22751.10 (  6.87%)	    24428.20
>                  length=1030, char=0:    543843.00 (-2122.73%)	    35761.20 (-46.16%)	    22771.20 (  6.93%)	    24467.30
>                  length=1031, char=0:    544725.00 (-2123.15%)	    35927.70 (-46.63%)	    22814.10 (  6.89%)	    24502.40
>                  length=1032, char=0:    545744.00 (-2124.10%)	    35882.10 (-46.23%)	    22844.50 (  6.90%)	    24537.70
>                  length=1033, char=0:    546968.00 (-2125.25%)	    36080.00 (-46.79%)	    22885.20 (  6.90%)	    24580.10
>                  length=1034, char=0:    548042.00 (-2126.35%)	    36208.30 (-47.09%)	    22922.90 (  6.88%)	    24616.20
>                  length=1035, char=0:    549066.00 (-2127.30%)	    36398.80 (-47.65%)	    22961.30 (  6.86%)	    24651.60
>                  length=1036, char=0:    550138.00 (-2127.95%)	    36558.40 (-48.05%)	    23008.70 (  6.82%)	    24692.60
>                  length=1037, char=0:    551170.00 (-2129.86%)	    36732.90 (-48.61%)	    23043.40 (  6.77%)	    24717.70
>                  length=1038, char=0:    552268.00 (-2130.95%)	    36722.80 (-48.35%)	    23078.80 (  6.77%)	    24754.80
>                  length=1039, char=0:    553270.00 (-2131.58%)	    36891.60 (-48.80%)	    23116.80 (  6.76%)	    24792.80
> 
> 	* sysdeps/aarch64/memset.S (do_no_zva): New macro.
> 	(do_zva_64): Likewise.
> 	(do_zva_128): Likewise.
> 	(__memset): Rename to MEMSET macro.
> 	(MEMSET): Use the new macros.
> 	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
> 	Add memset_generic, memset_nozva, memset_zva_64,
> 	memset_zva_128 and memset_generic.
> 	* sysdeps/aarch64/multiarch/ifunc-impl-list.c
> 	(__libc_ifunc_impl_list): Add memset ifuncs.
> 	* sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New
> 	local variable zva_size.
> 	* sysdeps/aarch64/multiarch/memset.c: New file.
> 	* sysdeps/aarch64/multiarch/memset_generic.S: New file.
> 	* sysdeps/aarch64/multiarch/memset_nozva.S: New file.
> 	* sysdeps/aarch64/multiarch/memset_zva_64.S: New file.
> 	* sysdeps/aarch64/multiarch/memset_zva_128.S: New file.
> 	* sysdeps/aarch64/multiarch/rtld-memset.S: New file.
> 	* sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> 	(DCZID_DZP_MASK): New macro.
> 	(DCZID_BS_MASK): Likewise.
> 	(init_cpu_features): Read and set zva_size.
> 	* sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> 	(struct cpu_features): New member zva_size.
> ---
>  sysdeps/aarch64/memset.S                       | 200 +++++++++++++++----------
>  sysdeps/aarch64/multiarch/Makefile             |   3 +-
>  sysdeps/aarch64/multiarch/ifunc-impl-list.c    |   5 +
>  sysdeps/aarch64/multiarch/init-arch.h          |   8 +-
>  sysdeps/aarch64/multiarch/memset.c             |  43 ++++++
>  sysdeps/aarch64/multiarch/memset_generic.S     |  28 ++++
>  sysdeps/aarch64/multiarch/memset_nozva.S       |  23 +++
>  sysdeps/aarch64/multiarch/memset_zva_128.S     |  24 +++
>  sysdeps/aarch64/multiarch/memset_zva_64.S      |  24 +++
>  sysdeps/aarch64/multiarch/rtld-memset.S        |  24 +++
>  sysdeps/unix/sysv/linux/aarch64/cpu-features.c |  10 ++
>  sysdeps/unix/sysv/linux/aarch64/cpu-features.h |   1 +
>  12 files changed, 307 insertions(+), 86 deletions(-)
>  create mode 100644 sysdeps/aarch64/multiarch/memset.c
>  create mode 100644 sysdeps/aarch64/multiarch/memset_generic.S
>  create mode 100644 sysdeps/aarch64/multiarch/memset_nozva.S
>  create mode 100644 sysdeps/aarch64/multiarch/memset_zva_128.S
>  create mode 100644 sysdeps/aarch64/multiarch/memset_zva_64.S
>  create mode 100644 sysdeps/aarch64/multiarch/rtld-memset.S
> 
> diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
> index 110fd22..9fea4c2 100644
> --- a/sysdeps/aarch64/memset.S
> +++ b/sysdeps/aarch64/memset.S
> @@ -37,7 +37,105 @@
>  #define zva_len x7
>  #define zva_lenw w7
>  
> -ENTRY_ALIGN (__memset, 6)
> +/* Macros that do the critical loops for either no zva or zva of 64 bytes, 128
> +   bytes and higher sizes.  */
> +
> +/* No ZVA.  */
> +.macro do_no_zva
> +	sub	count, dstend, dst	/* Count is 16 too large.  */
> +	add	dst, dst, 16
> +	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
> +1:	stp	q0, q0, [dst], 64
> +	stp	q0, q0, [dst, -32]
> +	subs	count, count, 64
> +	b.hi	1b
> +	stp	q0, q0, [dstend, -64]
> +	stp	q0, q0, [dstend, -32]
> +	ret
> +.endm
> +
> +/* Write the first and last 64 byte aligned block using stp rather
> +   than using DC ZVA.  This is faster on some cores.  */
> +.macro do_zva_64
> +	str	q0, [dst, 16]
> +	stp	q0, q0, [dst, 32]
> +	bic	dst, dst, 63
> +	stp	q0, q0, [dst, 64]
> +	stp	q0, q0, [dst, 96]
> +	sub	count, dstend, dst	/* Count is now 128 too large.	*/
> +	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
> +	add	dst, dst, 128
> +	nop
> +1:	dc	zva, dst
> +	add	dst, dst, 64
> +	subs	count, count, 64
> +	b.hi	1b
> +	stp	q0, q0, [dst, 0]
> +	stp	q0, q0, [dst, 32]
> +	stp	q0, q0, [dstend, -64]
> +	stp	q0, q0, [dstend, -32]
> +	ret
> +.endm
> +
> +/* ZVA size of 128 bytes.  */
> +.macro do_zva_128
> +	str	q0, [dst, 16]
> +	stp	q0, q0, [dst, 32]
> +	stp	q0, q0, [dst, 64]
> +	stp	q0, q0, [dst, 96]
> +	bic	dst, dst, 127
> +	sub	count, dstend, dst	/* Count is now 128 too large.	*/
> +	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
> +	add	dst, dst, 128
> +1:	dc	zva, dst
> +	add	dst, dst, 128
> +	subs	count, count, 128
> +	b.hi	1b
> +	stp	q0, q0, [dstend, -128]
> +	stp	q0, q0, [dstend, -96]
> +	stp	q0, q0, [dstend, -64]
> +	stp	q0, q0, [dstend, -32]
> +	ret
> +.endm
> +
> +/* ZVA size of more than 128 bytes.  */
> +.macro do_zva_default
> +	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
> +	cmp	count, tmp1
> +	blo	L(no_zva)
> +
> +	sub	tmp2, zva_len, 1
> +	add	tmp1, dst, zva_len
> +	add	dst, dst, 16
> +	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
> +	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
> +	beq	2f
> +1:	stp	q0, q0, [dst], 64
> +	stp	q0, q0, [dst, -32]
> +	subs	count, count, 64
> +	b.hi	1b
> +2:	mov	dst, tmp1
> +	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
> +	subs	count, count, zva_len
> +	b.lo	4f
> +3:	dc	zva, dst
> +	add	dst, dst, zva_len
> +	subs	count, count, zva_len
> +	b.hs	3b
> +4:	add	count, count, zva_len
> +	subs	count, count, 64
> +	b.ls	6f
> +5:	stp	q0, q0, [dst], 64
> +	stp	q0, q0, [dst, -32]
> +	subs	count, count, 64
> +	b.hi	5b
> +6:	stp	q0, q0, [dstend, -64]
> +	stp	q0, q0, [dstend, -32]
> +	ret
> +.endm
> +
> +/* Memset entry point.  */
> +ENTRY_ALIGN (MEMSET, 6)
>  
>  	DELOUSE (0)
>  	DELOUSE (2)
> @@ -89,107 +187,45 @@ L(set96):
>  	.p2align 3
>  	nop
>  L(set_long):
> +#ifdef MEMSET_ZVA
>  	and	valw, valw, 255
> +#endif
>  	bic	dst, dstin, 15
>  	str	q0, [dstin]
> +#ifdef MEMSET_ZVA
>  	cmp	count, 256
>  	ccmp	valw, 0, 0, cs
>  	b.eq	L(try_zva)
> +#endif
>  L(no_zva):
> -	sub	count, dstend, dst	/* Count is 16 too large.  */
> -	add	dst, dst, 16
> -	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
> -1:	stp	q0, q0, [dst], 64
> -	stp	q0, q0, [dst, -32]
> -L(tail64):
> -	subs	count, count, 64
> -	b.hi	1b
> -2:	stp	q0, q0, [dstend, -64]
> -	stp	q0, q0, [dstend, -32]
> -	ret
> +	do_no_zva
>  
> -	.p2align 3
> +#ifdef MEMSET_ZVA
> +	.p2align 4
>  L(try_zva):
> +# if MEMSET_ZVA == 64
> +	do_zva_64
> +# elif MEMSET_ZVA == 128
> +	do_zva_128
> +# else
>  	mrs	tmp1, dczid_el0
>  	tbnz	tmp1w, 4, L(no_zva)
>  	and	tmp1w, tmp1w, 15
>  	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
>  	b.ne	 L(zva_128)
> +	do_zva_64
>  
> -	/* Write the first and last 64 byte aligned block using stp rather
> -	   than using DC ZVA.  This is faster on some cores.
> -	 */
> -L(zva_64):
> -	str	q0, [dst, 16]
> -	stp	q0, q0, [dst, 32]
> -	bic	dst, dst, 63
> -	stp	q0, q0, [dst, 64]
> -	stp	q0, q0, [dst, 96]
> -	sub	count, dstend, dst	/* Count is now 128 too large.	*/
> -	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
> -	add	dst, dst, 128
> -	nop
> -1:	dc	zva, dst
> -	add	dst, dst, 64
> -	subs	count, count, 64
> -	b.hi	1b
> -	stp	q0, q0, [dst, 0]
> -	stp	q0, q0, [dst, 32]
> -	stp	q0, q0, [dstend, -64]
> -	stp	q0, q0, [dstend, -32]
> -	ret
> -
> -	.p2align 3
>  L(zva_128):
>  	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
>  	b.ne	L(zva_other)
> -
> -	str	q0, [dst, 16]
> -	stp	q0, q0, [dst, 32]
> -	stp	q0, q0, [dst, 64]
> -	stp	q0, q0, [dst, 96]
> -	bic	dst, dst, 127
> -	sub	count, dstend, dst	/* Count is now 128 too large.	*/
> -	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
> -	add	dst, dst, 128
> -1:	dc	zva, dst
> -	add	dst, dst, 128
> -	subs	count, count, 128
> -	b.hi	1b
> -	stp	q0, q0, [dstend, -128]
> -	stp	q0, q0, [dstend, -96]
> -	stp	q0, q0, [dstend, -64]
> -	stp	q0, q0, [dstend, -32]
> -	ret
> +	do_zva_128
>  
>  L(zva_other):
>  	mov	tmp2w, 4
>  	lsl	zva_lenw, tmp2w, tmp1w
> -	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
> -	cmp	count, tmp1
> -	blo	L(no_zva)
> +	do_zva_default
> +# endif
> +#endif
>  
> -	sub	tmp2, zva_len, 1
> -	add	tmp1, dst, zva_len
> -	add	dst, dst, 16
> -	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
> -	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
> -	beq	2f
> -1:	stp	q0, q0, [dst], 64
> -	stp	q0, q0, [dst, -32]
> -	subs	count, count, 64
> -	b.hi	1b
> -2:	mov	dst, tmp1
> -	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
> -	subs	count, count, zva_len
> -	b.lo	4f
> -3:	dc	zva, dst
> -	add	dst, dst, zva_len
> -	subs	count, count, zva_len
> -	b.hs	3b
> -4:	add	count, count, zva_len
> -	b	L(tail64)
> -
> -END (__memset)
> -weak_alias (__memset, memset)
> -libc_hidden_builtin_def (memset)
> +END (MEMSET)
> +libc_hidden_builtin_def (MEMSET)
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index 9aa1e79..c1e17e8 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -1,4 +1,5 @@
>  ifeq ($(subdir),string)
>  sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
> -		   memmove_falkor
> +		   memmove_falkor memset_generic memset_nozva memset_zva_64 \
> +		   memset_zva_128
>  endif
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index 2cb74d5..fb695ce 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -46,6 +46,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
>  	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
>  	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
> +  IFUNC_IMPL (i, name, memset,
> +	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_nozva)
> +	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva_64)
> +	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 128), __memset_zva_128)
> +	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
>  
>    return i;
>  }
> diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
> index 3af442c..a756dad 100644
> --- a/sysdeps/aarch64/multiarch/init-arch.h
> +++ b/sysdeps/aarch64/multiarch/init-arch.h
> @@ -18,6 +18,8 @@
>  
>  #include <ldsodefs.h>
>  
> -#define INIT_ARCH()				\
> -  uint64_t __attribute__((unused)) midr =	\
> -    GLRO(dl_aarch64_cpu_features).midr_el1;
> +#define INIT_ARCH()							      \
> +  uint64_t __attribute__((unused)) midr =				      \
> +    GLRO(dl_aarch64_cpu_features).midr_el1;				      \
> +  unsigned __attribute__((unused)) zva_size =				      \
> +    GLRO(dl_aarch64_cpu_features).zva_size;
> diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
> new file mode 100644
> index 0000000..a7e34c0
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset.c
> @@ -0,0 +1,43 @@
> +/* Multiple versions of memset. AARCH64 version.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +
> +#if IS_IN (libc)
> +/* Redefine memset so that the compiler won't complain about the type
> +   mismatch with the IFUNC selector in strong_alias, below.  */
> +# undef memset
> +# define memset __redirect_memset
> +# include <string.h>
> +# include <init-arch.h>
> +
> +extern __typeof (__redirect_memset) __libc_memset;
> +
> +extern __typeof (__redirect_memset) __memset_nozva attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_64 attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_128 attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
> +
> +libc_ifunc (__libc_memset, (zva_size == 0 ? __memset_nozva
> +			    : (zva_size == 64 ? __memset_zva_64
> +			       : (zva_size == 128 ? __memset_zva_128
> +				  : __memset_generic))));
> +
> +# undef memset
> +strong_alias (__libc_memset, memset);
> +#endif
> diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S
> new file mode 100644
> index 0000000..8871600
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_generic.S
> @@ -0,0 +1,28 @@
> +/* Memset for aarch64, default version for internal use.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (libc)
> +# define MEMSET __memset_generic
> +# define MEMSET_ZVA 1
> +/* Add a hidden definition for use within libc.so.  */
> +# ifdef SHARED
> +	.globl __GI_memset; __GI_memset = __memset_generic
> +# endif
> +# include <sysdeps/aarch64/memset.S>
> +#endif
> diff --git a/sysdeps/aarch64/multiarch/memset_nozva.S b/sysdeps/aarch64/multiarch/memset_nozva.S
> new file mode 100644
> index 0000000..2d4fc42
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_nozva.S
> @@ -0,0 +1,23 @@
> +/* Memset for aarch64, ZVA disabled.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (libc)
> +# define MEMSET __memset_nozva
> +# include <sysdeps/aarch64/memset.S>
> +#endif
> diff --git a/sysdeps/aarch64/multiarch/memset_zva_128.S b/sysdeps/aarch64/multiarch/memset_zva_128.S
> new file mode 100644
> index 0000000..2c68127
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_zva_128.S
> @@ -0,0 +1,24 @@
> +/* Memset for aarch64, ZVA enabled and == 128 bytes.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (libc)
> +# define MEMSET __memset_zva_128
> +# define MEMSET_ZVA 128
> +# include <sysdeps/aarch64/memset.S>
> +#endif
> diff --git a/sysdeps/aarch64/multiarch/memset_zva_64.S b/sysdeps/aarch64/multiarch/memset_zva_64.S
> new file mode 100644
> index 0000000..ff895f9
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_zva_64.S
> @@ -0,0 +1,24 @@
> +/* Memset for aarch64, ZVA enabled and == 64 bytes.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (libc)
> +# define MEMSET __memset_zva_64
> +# define MEMSET_ZVA 64
> +# include <sysdeps/aarch64/memset.S>
> +#endif
> diff --git a/sysdeps/aarch64/multiarch/rtld-memset.S b/sysdeps/aarch64/multiarch/rtld-memset.S
> new file mode 100644
> index 0000000..172df42
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/rtld-memset.S
> @@ -0,0 +1,24 @@
> +/* Memset for aarch64, for the dynamic linker.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (rtld)
> +# define MEMSET memset
> +# define MEMSET_ZVA 1
> +# include <sysdeps/aarch64/memset.S>
> +#endif
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> index e769eeb..092ee81 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> @@ -20,6 +20,9 @@
>  #include <sys/auxv.h>
>  #include <elf/dl-hwcaps.h>
>  
> +#define DCZID_DZP_MASK (1 << 4)
> +#define DCZID_BS_MASK (0xf)
> +
>  #if HAVE_TUNABLES
>  struct cpu_list
>  {
> @@ -72,4 +75,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>      }
>  
>    cpu_features->midr_el1 = midr;
> +
> +  /* Check if ZVA is enabled.  */
> +  unsigned dczid;
> +  asm volatile ("mrs %0, dczid_el0" : "=r"(dczid));
> +
> +  if ((dczid & DCZID_DZP_MASK) == 0)
> +    cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
>  }
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> index 73cb53d..f2b6afd 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> @@ -47,6 +47,7 @@
>  struct cpu_features
>  {
>    uint64_t midr_el1;
> +  unsigned zva_size;
>  };
>  
>  #endif /* _CPU_FEATURES_AARCH64_H  */
>
Adhemerval Zanella Netto - Oct. 11, 2017, 7:57 p.m.
LGTM; as for the points brought up by Szabolcs:

  - I still prefer to have the macros in one file instead of multiple ones.
    It makes checking all the possible code paths easier, and one can
    theoretically build the desired memset by just defining the required
    macros instead of pulling source code from different files.

On 10/10/2017 07:05, Siddhesh Poyarekar wrote:
> Ping!
> 
> On Thursday 05 October 2017 10:46 PM, Siddhesh Poyarekar wrote:
>> The DZP bit in the dczid_el0 register does not change dynamically, so
>> it is safe to read once during program startup.  Hoist the zva check
>> into an ifunc resolver and store the result into a static variable,
>> which can be read in case of non-standard zva sizes.  This effectively
>> adds 3 ifunc variants for memset - one for cases where zva is
>> disabled, one for 64 byte zva and another for 128 byte zva.  I have
>> retained the older memset as __memset_generic for internal libc.so use
>> so that the change impact is minimal.  We should eventually have a
>> discussion on what is more expensive, reading dczid_el0 on every
>> memset invocation or the indirection due to PLT.
>>
>> The gains due to this are significant for falkor, with run time
>> reductions as high as 42% in some cases.  Likewise for mustang,
>> although the numbers are slightly lower.  Here's a sample from the
>> falkor tests:
>>
>> Function: memset
>> Variant: walk
>>                                     simple_memset	__memset_nozva	__memset_zva_64	__memset_generic
>> ========================================================================================================================
>>                   length=256, char=0:     35936.10 (-706.66%)	     2429.88 ( 45.46%)	     2571.85 ( 42.27%)	     4454.92
>>                   length=257, char=0:     36209.50 (-710.17%)	     2436.12 ( 45.49%)	     2564.25 ( 42.63%)	     4469.36
>>                   length=258, char=0:     36507.90 (-710.21%)	     2522.06 ( 44.03%)	     2578.89 ( 42.77%)	     4505.99
>>                   length=259, char=0:     36764.30 (-711.99%)	     2611.61 ( 42.32%)	     2593.52 ( 42.72%)	     4527.69
>>                   length=260, char=0:     36943.30 (-712.62%)	     2639.06 ( 41.95%)	     2608.24 ( 42.63%)	     4546.19
>>                   length=261, char=0:     37287.50 (-717.27%)	     2623.07 ( 42.51%)	     2623.17 ( 42.51%)	     4562.47
>>                   length=262, char=0:     37573.70 (-722.44%)	     2665.51 ( 41.66%)	     2637.28 ( 42.27%)	     4568.56
>>                   length=263, char=0:     37833.70 (-724.30%)	     2692.70 ( 41.33%)	     2668.38 ( 41.86%)	     4589.79
>>                   length=264, char=0:     38136.00 (-727.49%)	     2737.30 ( 40.61%)	     2685.48 ( 41.73%)	     4608.66
>>                   length=265, char=0:     38403.10 (-730.30%)	     2778.70 ( 39.92%)	     2695.10 ( 41.73%)	     4625.23
>>                   length=266, char=0:     38684.50 (-729.88%)	     2822.16 ( 39.46%)	     2692.91 ( 42.23%)	     4661.47
>>                   length=267, char=0:     38954.10 (-732.30%)	     2867.41 ( 38.73%)	     2706.28 ( 42.18%)	     4680.31
>>                   length=268, char=0:     39155.00 (-733.08%)	     2968.76 ( 36.84%)	     2721.89 ( 42.09%)	     4700.03
>>                   length=269, char=0:     39559.30 (-737.49%)	     3057.49 ( 35.27%)	     2737.61 ( 42.04%)	     4723.54
>>                   length=270, char=0:     39813.80 (-742.51%)	     3073.64 ( 34.96%)	     2751.70 ( 41.77%)	     4725.60
>>                   length=271, char=0:     40070.60 (-744.40%)	     3103.55 ( 34.60%)	     2784.25 ( 41.33%)	     4745.43
>>                   length=512, char=0:    137515.00 (-1275.48%)	     8971.95 ( 10.26%)	     7168.66 ( 28.30%)	     9997.61
>>                   length=513, char=0:    138015.00 (-1284.40%)	     8987.07 (  9.85%)	     7242.59 ( 27.35%)	     9969.29
>>                   length=514, char=0:    138556.00 (-1286.76%)	     9200.17 (  7.92%)	     7211.49 ( 27.82%)	     9991.38
>>                   length=515, char=0:    139182.00 (-1277.21%)	     9223.64 (  8.73%)	     7232.78 ( 28.43%)	    10106.10
>>                   length=516, char=0:    139512.00 (-1288.41%)	     9306.80 (  7.38%)	     7312.15 ( 27.23%)	    10048.30
>>                   length=517, char=0:    140117.00 (-1292.65%)	     9429.22 (  6.28%)	     7273.52 ( 27.71%)	    10061.20
>>                   length=518, char=0:    140706.00 (-1294.63%)	     9463.83 (  6.20%)	     7292.57 ( 27.72%)	    10089.10
>>                   length=519, char=0:    141221.00 (-1289.12%)	     9548.99 (  6.07%)	     7312.75 ( 28.07%)	    10166.20
>>                   length=520, char=0:    141696.00 (-1297.00%)	     9713.49 (  4.27%)	     7386.44 ( 27.21%)	    10147.00
>>                   length=521, char=0:    142309.00 (-1298.82%)	     9888.41 (  2.80%)	     7361.91 ( 27.64%)	    10173.50
>>                   length=522, char=0:    142878.00 (-1292.34%)	     9909.30 (  3.43%)	     7381.22 ( 28.07%)	    10261.70
>>                   length=523, char=0:    143327.00 (-1300.69%)	     9918.78 (  3.07%)	     7462.93 ( 27.07%)	    10232.60
>>                   length=524, char=0:    143776.00 (-1301.67%)	    10055.40 (  1.97%)	     7428.56 ( 27.58%)	    10257.50
>>                   length=525, char=0:    144429.00 (-1296.79%)	    10090.80 (  2.41%)	     7449.84 ( 27.95%)	    10340.10
>>                   length=526, char=0:    144976.00 (-1305.05%)	    10178.80 (  1.35%)	     7530.66 ( 27.02%)	    10318.20
>>                   length=527, char=0:    145551.00 (-1306.63%)	    10314.40 (  0.32%)	     7498.48 ( 27.53%)	    10347.50
>>                  length=1024, char=0:    537600.00 (-2116.32%)	    34541.10 (-42.40%)	    22541.00 (  7.07%)	    24256.40
>>                  length=1025, char=0:    538490.00 (-2117.66%)	    34560.10 (-42.33%)	    22574.10 (  7.03%)	    24281.90
>>                  length=1026, char=0:    539596.00 (-2118.30%)	    34869.20 (-43.35%)	    22615.10 (  7.03%)	    24324.70
>>                  length=1027, char=0:    540544.00 (-2118.30%)	    35020.60 (-43.72%)	    22654.60 (  7.03%)	    24367.50
>>                  length=1028, char=0:    541355.00 (-2119.44%)	    35407.20 (-45.16%)	    22702.00 (  6.93%)	    24391.50
>>                  length=1029, char=0:    542678.00 (-2121.52%)	    35806.10 (-46.58%)	    22751.10 (  6.87%)	    24428.20
>>                  length=1030, char=0:    543843.00 (-2122.73%)	    35761.20 (-46.16%)	    22771.20 (  6.93%)	    24467.30
>>                  length=1031, char=0:    544725.00 (-2123.15%)	    35927.70 (-46.63%)	    22814.10 (  6.89%)	    24502.40
>>                  length=1032, char=0:    545744.00 (-2124.10%)	    35882.10 (-46.23%)	    22844.50 (  6.90%)	    24537.70
>>                  length=1033, char=0:    546968.00 (-2125.25%)	    36080.00 (-46.79%)	    22885.20 (  6.90%)	    24580.10
>>                  length=1034, char=0:    548042.00 (-2126.35%)	    36208.30 (-47.09%)	    22922.90 (  6.88%)	    24616.20
>>                  length=1035, char=0:    549066.00 (-2127.30%)	    36398.80 (-47.65%)	    22961.30 (  6.86%)	    24651.60
>>                  length=1036, char=0:    550138.00 (-2127.95%)	    36558.40 (-48.05%)	    23008.70 (  6.82%)	    24692.60
>>                  length=1037, char=0:    551170.00 (-2129.86%)	    36732.90 (-48.61%)	    23043.40 (  6.77%)	    24717.70
>>                  length=1038, char=0:    552268.00 (-2130.95%)	    36722.80 (-48.35%)	    23078.80 (  6.77%)	    24754.80
>>                  length=1039, char=0:    553270.00 (-2131.58%)	    36891.60 (-48.80%)	    23116.80 (  6.76%)	    24792.80
>>
>> 	* sysdeps/aarch64/memset.S (do_no_zva): New macro.
>> 	(do_zva_64): Likewise.
>> 	(do_zva_128): Likewise.
>> 	(__memset): Rename to MEMSET macro.
>> 	(MEMSET): Use the new macros.
>> 	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
>> 	Add memset_generic, memset_nozva, memset_zva_64,
>> 	memset_zva_128 and memset_generic.
>> 	* sysdeps/aarch64/multiarch/ifunc-impl-list.c
>> 	(__libc_ifunc_impl_list): Add memset ifuncs.
>> 	* sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New
>> 	local variable zva_size.
>> 	* sysdeps/aarch64/multiarch/memset.c: New file.
>> 	* sysdeps/aarch64/multiarch/memset_generic.S: New file.
>> 	* sysdeps/aarch64/multiarch/memset_nozva.S: New file.
>> 	* sysdeps/aarch64/multiarch/memset_zva_64.S: New file.
>> 	* sysdeps/aarch64/multiarch/memset_zva_128.S: New file.
>> 	* sysdeps/aarch64/multiarch/rtld-memset.S: New file.
>> 	* sysdeps/unix/sysv/linux/aarch64/cpu-features.c
>> 	(DCZID_DZP_MASK): New macro.
>> 	(DCZID_BS_MASK): Likewise.
>> 	(init_cpu_features): Read and set zva_size.
>> 	* sysdeps/unix/sysv/linux/aarch64/cpu-features.h
>> 	(struct cpu_features): New member zva_size.
>> ---
>>  sysdeps/aarch64/memset.S                       | 200 +++++++++++++++----------
>>  sysdeps/aarch64/multiarch/Makefile             |   3 +-
>>  sysdeps/aarch64/multiarch/ifunc-impl-list.c    |   5 +
>>  sysdeps/aarch64/multiarch/init-arch.h          |   8 +-
>>  sysdeps/aarch64/multiarch/memset.c             |  43 ++++++
>>  sysdeps/aarch64/multiarch/memset_generic.S     |  28 ++++
>>  sysdeps/aarch64/multiarch/memset_nozva.S       |  23 +++
>>  sysdeps/aarch64/multiarch/memset_zva_128.S     |  24 +++
>>  sysdeps/aarch64/multiarch/memset_zva_64.S      |  24 +++
>>  sysdeps/aarch64/multiarch/rtld-memset.S        |  24 +++
>>  sysdeps/unix/sysv/linux/aarch64/cpu-features.c |  10 ++
>>  sysdeps/unix/sysv/linux/aarch64/cpu-features.h |   1 +
>>  12 files changed, 307 insertions(+), 86 deletions(-)
>>  create mode 100644 sysdeps/aarch64/multiarch/memset.c
>>  create mode 100644 sysdeps/aarch64/multiarch/memset_generic.S
>>  create mode 100644 sysdeps/aarch64/multiarch/memset_nozva.S
>>  create mode 100644 sysdeps/aarch64/multiarch/memset_zva_128.S
>>  create mode 100644 sysdeps/aarch64/multiarch/memset_zva_64.S
>>  create mode 100644 sysdeps/aarch64/multiarch/rtld-memset.S
>>
>> diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
>> index 110fd22..9fea4c2 100644
>> --- a/sysdeps/aarch64/memset.S
>> +++ b/sysdeps/aarch64/memset.S
>> @@ -37,7 +37,105 @@
>>  #define zva_len x7
>>  #define zva_lenw w7
>>  
>> -ENTRY_ALIGN (__memset, 6)
>> +/* Macros that do the critical loops for either no zva or zva of 64 bytes, 128
>> +   bytes and higher sizes.  */
>> +
>> +/* No ZVA.  */
>> +.macro do_no_zva
>> +	sub	count, dstend, dst	/* Count is 16 too large.  */
>> +	add	dst, dst, 16
>> +	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
>> +1:	stp	q0, q0, [dst], 64
>> +	stp	q0, q0, [dst, -32]
>> +	subs	count, count, 64
>> +	b.hi	1b
>> +	stp	q0, q0, [dstend, -64]
>> +	stp	q0, q0, [dstend, -32]
>> +	ret
>> +.endm
>> +
>> +/* Write the first and last 64 byte aligned block using stp rather
>> +   than using DC ZVA.  This is faster on some cores.  */
>> +.macro do_zva_64
>> +	str	q0, [dst, 16]
>> +	stp	q0, q0, [dst, 32]
>> +	bic	dst, dst, 63
>> +	stp	q0, q0, [dst, 64]
>> +	stp	q0, q0, [dst, 96]
>> +	sub	count, dstend, dst	/* Count is now 128 too large.	*/
>> +	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
>> +	add	dst, dst, 128
>> +	nop
>> +1:	dc	zva, dst
>> +	add	dst, dst, 64
>> +	subs	count, count, 64
>> +	b.hi	1b
>> +	stp	q0, q0, [dst, 0]
>> +	stp	q0, q0, [dst, 32]
>> +	stp	q0, q0, [dstend, -64]
>> +	stp	q0, q0, [dstend, -32]
>> +	ret
>> +.endm
>> +
>> +/* ZVA size of 128 bytes.  */
>> +.macro do_zva_128
>> +	str	q0, [dst, 16]
>> +	stp	q0, q0, [dst, 32]
>> +	stp	q0, q0, [dst, 64]
>> +	stp	q0, q0, [dst, 96]
>> +	bic	dst, dst, 127
>> +	sub	count, dstend, dst	/* Count is now 128 too large.	*/
>> +	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
>> +	add	dst, dst, 128
>> +1:	dc	zva, dst
>> +	add	dst, dst, 128
>> +	subs	count, count, 128
>> +	b.hi	1b
>> +	stp	q0, q0, [dstend, -128]
>> +	stp	q0, q0, [dstend, -96]
>> +	stp	q0, q0, [dstend, -64]
>> +	stp	q0, q0, [dstend, -32]
>> +	ret
>> +.endm
>> +
>> +/* ZVA size of more than 128 bytes.  */
>> +.macro do_zva_default
>> +	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
>> +	cmp	count, tmp1
>> +	blo	L(no_zva)
>> +
>> +	sub	tmp2, zva_len, 1
>> +	add	tmp1, dst, zva_len
>> +	add	dst, dst, 16
>> +	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
>> +	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
>> +	beq	2f
>> +1:	stp	q0, q0, [dst], 64
>> +	stp	q0, q0, [dst, -32]
>> +	subs	count, count, 64
>> +	b.hi	1b
>> +2:	mov	dst, tmp1
>> +	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
>> +	subs	count, count, zva_len
>> +	b.lo	4f
>> +3:	dc	zva, dst
>> +	add	dst, dst, zva_len
>> +	subs	count, count, zva_len
>> +	b.hs	3b
>> +4:	add	count, count, zva_len
>> +	subs	count, count, 64
>> +	b.ls	6f
>> +5:	stp	q0, q0, [dst], 64
>> +	stp	q0, q0, [dst, -32]
>> +	subs	count, count, 64
>> +	b.hi	5b
>> +6:	stp	q0, q0, [dstend, -64]
>> +	stp	q0, q0, [dstend, -32]
>> +	ret
>> +.endm
>> +
>> +/* Memset entry point.  */
>> +ENTRY_ALIGN (MEMSET, 6)
>>  
>>  	DELOUSE (0)
>>  	DELOUSE (2)
>> @@ -89,107 +187,45 @@ L(set96):
>>  	.p2align 3
>>  	nop
>>  L(set_long):
>> +#ifdef MEMSET_ZVA
>>  	and	valw, valw, 255
>> +#endif
>>  	bic	dst, dstin, 15
>>  	str	q0, [dstin]
>> +#ifdef MEMSET_ZVA
>>  	cmp	count, 256
>>  	ccmp	valw, 0, 0, cs
>>  	b.eq	L(try_zva)
>> +#endif
>>  L(no_zva):
>> -	sub	count, dstend, dst	/* Count is 16 too large.  */
>> -	add	dst, dst, 16
>> -	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
>> -1:	stp	q0, q0, [dst], 64
>> -	stp	q0, q0, [dst, -32]
>> -L(tail64):
>> -	subs	count, count, 64
>> -	b.hi	1b
>> -2:	stp	q0, q0, [dstend, -64]
>> -	stp	q0, q0, [dstend, -32]
>> -	ret
>> +	do_no_zva
>>  
>> -	.p2align 3
>> +#ifdef MEMSET_ZVA
>> +	.p2align 4
>>  L(try_zva):
>> +# if MEMSET_ZVA == 64
>> +	do_zva_64
>> +# elif MEMSET_ZVA == 128
>> +	do_zva_128
>> +# else
>>  	mrs	tmp1, dczid_el0
>>  	tbnz	tmp1w, 4, L(no_zva)
>>  	and	tmp1w, tmp1w, 15
>>  	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
>>  	b.ne	 L(zva_128)
>> +	do_zva_64
>>  
>> -	/* Write the first and last 64 byte aligned block using stp rather
>> -	   than using DC ZVA.  This is faster on some cores.
>> -	 */
>> -L(zva_64):
>> -	str	q0, [dst, 16]
>> -	stp	q0, q0, [dst, 32]
>> -	bic	dst, dst, 63
>> -	stp	q0, q0, [dst, 64]
>> -	stp	q0, q0, [dst, 96]
>> -	sub	count, dstend, dst	/* Count is now 128 too large.	*/
>> -	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
>> -	add	dst, dst, 128
>> -	nop
>> -1:	dc	zva, dst
>> -	add	dst, dst, 64
>> -	subs	count, count, 64
>> -	b.hi	1b
>> -	stp	q0, q0, [dst, 0]
>> -	stp	q0, q0, [dst, 32]
>> -	stp	q0, q0, [dstend, -64]
>> -	stp	q0, q0, [dstend, -32]
>> -	ret
>> -
>> -	.p2align 3
>>  L(zva_128):
>>  	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
>>  	b.ne	L(zva_other)
>> -
>> -	str	q0, [dst, 16]
>> -	stp	q0, q0, [dst, 32]
>> -	stp	q0, q0, [dst, 64]
>> -	stp	q0, q0, [dst, 96]
>> -	bic	dst, dst, 127
>> -	sub	count, dstend, dst	/* Count is now 128 too large.	*/
>> -	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
>> -	add	dst, dst, 128
>> -1:	dc	zva, dst
>> -	add	dst, dst, 128
>> -	subs	count, count, 128
>> -	b.hi	1b
>> -	stp	q0, q0, [dstend, -128]
>> -	stp	q0, q0, [dstend, -96]
>> -	stp	q0, q0, [dstend, -64]
>> -	stp	q0, q0, [dstend, -32]
>> -	ret
>> +	do_zva_128
>>  
>>  L(zva_other):
>>  	mov	tmp2w, 4
>>  	lsl	zva_lenw, tmp2w, tmp1w
>> -	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
>> -	cmp	count, tmp1
>> -	blo	L(no_zva)
>> +	do_zva_default
>> +# endif
>> +#endif
>>  
>> -	sub	tmp2, zva_len, 1
>> -	add	tmp1, dst, zva_len
>> -	add	dst, dst, 16
>> -	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
>> -	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
>> -	beq	2f
>> -1:	stp	q0, q0, [dst], 64
>> -	stp	q0, q0, [dst, -32]
>> -	subs	count, count, 64
>> -	b.hi	1b
>> -2:	mov	dst, tmp1
>> -	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
>> -	subs	count, count, zva_len
>> -	b.lo	4f
>> -3:	dc	zva, dst
>> -	add	dst, dst, zva_len
>> -	subs	count, count, zva_len
>> -	b.hs	3b
>> -4:	add	count, count, zva_len
>> -	b	L(tail64)
>> -
>> -END (__memset)
>> -weak_alias (__memset, memset)
>> -libc_hidden_builtin_def (memset)
>> +END (MEMSET)
>> +libc_hidden_builtin_def (MEMSET)
>> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
>> index 9aa1e79..c1e17e8 100644
>> --- a/sysdeps/aarch64/multiarch/Makefile
>> +++ b/sysdeps/aarch64/multiarch/Makefile
>> @@ -1,4 +1,5 @@
>>  ifeq ($(subdir),string)
>>  sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
>> -		   memmove_falkor
>> +		   memmove_falkor memset_generic memset_nozva memset_zva_64 \
>> +		   memset_zva_128
>>  endif
>> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
>> index 2cb74d5..fb695ce 100644
>> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
>> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
>> @@ -46,6 +46,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>  	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
>>  	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
>>  	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
>> +  IFUNC_IMPL (i, name, memset,
>> +	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_nozva)
>> +	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva_64)
>> +	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 128), __memset_zva_128)
>> +	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
>>  
>>    return i;
>>  }
>> diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
>> index 3af442c..a756dad 100644
>> --- a/sysdeps/aarch64/multiarch/init-arch.h
>> +++ b/sysdeps/aarch64/multiarch/init-arch.h
>> @@ -18,6 +18,8 @@
>>  
>>  #include <ldsodefs.h>
>>  
>> -#define INIT_ARCH()				\
>> -  uint64_t __attribute__((unused)) midr =	\
>> -    GLRO(dl_aarch64_cpu_features).midr_el1;
>> +#define INIT_ARCH()							      \
>> +  uint64_t __attribute__((unused)) midr =				      \
>> +    GLRO(dl_aarch64_cpu_features).midr_el1;				      \
>> +  unsigned __attribute__((unused)) zva_size =				      \
>> +    GLRO(dl_aarch64_cpu_features).zva_size;
>> diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
>> new file mode 100644
>> index 0000000..a7e34c0
>> --- /dev/null
>> +++ b/sysdeps/aarch64/multiarch/memset.c
>> @@ -0,0 +1,43 @@
>> +/* Multiple versions of memset. AARCH64 version.
>> +   Copyright (C) 2017 Free Software Foundation, Inc.
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <http://www.gnu.org/licenses/>.  */
>> +
>> +/* Define multiple versions only for the definition in libc.  */
>> +
>> +#if IS_IN (libc)
>> +/* Redefine memset so that the compiler won't complain about the type
>> +   mismatch with the IFUNC selector in strong_alias, below.  */
>> +# undef memset
>> +# define memset __redirect_memset
>> +# include <string.h>
>> +# include <init-arch.h>
>> +
>> +extern __typeof (__redirect_memset) __libc_memset;
>> +
>> +extern __typeof (__redirect_memset) __memset_nozva attribute_hidden;
>> +extern __typeof (__redirect_memset) __memset_zva_64 attribute_hidden;
>> +extern __typeof (__redirect_memset) __memset_zva_128 attribute_hidden;
>> +extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
>> +
>> +libc_ifunc (__libc_memset, (zva_size == 0 ? __memset_nozva
>> +			    : (zva_size == 64 ? __memset_zva_64
>> +			       : (zva_size == 128 ? __memset_zva_128
>> +				  : __memset_generic))));
>> +
>> +# undef memset
>> +strong_alias (__libc_memset, memset);
>> +#endif
>> diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S
>> new file mode 100644
>> index 0000000..8871600
>> --- /dev/null
>> +++ b/sysdeps/aarch64/multiarch/memset_generic.S
>> @@ -0,0 +1,28 @@
>> +/* Memset for aarch64, default version for internal use.
>> +   Copyright (C) 2017 Free Software Foundation, Inc.
>> +
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library.  If not, see
>> +   <http://www.gnu.org/licenses/>.  */
>> +
>> +#if IS_IN (libc)
>> +# define MEMSET __memset_generic
>> +# define MEMSET_ZVA 1
>> +/* Add a hidden definition for use within libc.so.  */
>> +# ifdef SHARED
>> +	.globl __GI_memset; __GI_memset = __memset_generic
>> +# endif
>> +# include <sysdeps/aarch64/memset.S>
>> +#endif
>> diff --git a/sysdeps/aarch64/multiarch/memset_nozva.S b/sysdeps/aarch64/multiarch/memset_nozva.S
>> new file mode 100644
>> index 0000000..2d4fc42
>> --- /dev/null
>> +++ b/sysdeps/aarch64/multiarch/memset_nozva.S
>> @@ -0,0 +1,23 @@
>> +/* Memset for aarch64, ZVA disabled.
>> +   Copyright (C) 2017 Free Software Foundation, Inc.
>> +
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library.  If not, see
>> +   <http://www.gnu.org/licenses/>.  */
>> +
>> +#if IS_IN (libc)
>> +# define MEMSET __memset_nozva
>> +# include <sysdeps/aarch64/memset.S>
>> +#endif
>> diff --git a/sysdeps/aarch64/multiarch/memset_zva_128.S b/sysdeps/aarch64/multiarch/memset_zva_128.S
>> new file mode 100644
>> index 0000000..2c68127
>> --- /dev/null
>> +++ b/sysdeps/aarch64/multiarch/memset_zva_128.S
>> @@ -0,0 +1,24 @@
>> +/* Memset for aarch64, ZVA enabled and == 128 bytes.
>> +   Copyright (C) 2017 Free Software Foundation, Inc.
>> +
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library.  If not, see
>> +   <http://www.gnu.org/licenses/>.  */
>> +
>> +#if IS_IN (libc)
>> +# define MEMSET __memset_zva_128
>> +# define MEMSET_ZVA 128
>> +# include <sysdeps/aarch64/memset.S>
>> +#endif
>> diff --git a/sysdeps/aarch64/multiarch/memset_zva_64.S b/sysdeps/aarch64/multiarch/memset_zva_64.S
>> new file mode 100644
>> index 0000000..ff895f9
>> --- /dev/null
>> +++ b/sysdeps/aarch64/multiarch/memset_zva_64.S
>> @@ -0,0 +1,24 @@
>> +/* Memset for aarch64, ZVA enabled and == 64 bytes.
>> +   Copyright (C) 2017 Free Software Foundation, Inc.
>> +
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library.  If not, see
>> +   <http://www.gnu.org/licenses/>.  */
>> +
>> +#if IS_IN (libc)
>> +# define MEMSET __memset_zva_64
>> +# define MEMSET_ZVA 64
>> +# include <sysdeps/aarch64/memset.S>
>> +#endif
>> diff --git a/sysdeps/aarch64/multiarch/rtld-memset.S b/sysdeps/aarch64/multiarch/rtld-memset.S
>> new file mode 100644
>> index 0000000..172df42
>> --- /dev/null
>> +++ b/sysdeps/aarch64/multiarch/rtld-memset.S
>> @@ -0,0 +1,24 @@
>> +/* Memset for aarch64, for the dynamic linker.
>> +   Copyright (C) 2017 Free Software Foundation, Inc.
>> +
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library.  If not, see
>> +   <http://www.gnu.org/licenses/>.  */
>> +
>> +#if IS_IN (rtld)
>> +# define MEMSET memset
>> +# define MEMSET_ZVA 1
>> +# include <sysdeps/aarch64/memset.S>
>> +#endif
>> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
>> index e769eeb..092ee81 100644
>> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
>> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
>> @@ -20,6 +20,9 @@
>>  #include <sys/auxv.h>
>>  #include <elf/dl-hwcaps.h>
>>  
>> +#define DCZID_DZP_MASK (1 << 4)
>> +#define DCZID_BS_MASK (0xf)
>> +
>>  #if HAVE_TUNABLES
>>  struct cpu_list
>>  {
>> @@ -72,4 +75,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>>      }
>>  
>>    cpu_features->midr_el1 = midr;
>> +
>> +  /* Check if ZVA is enabled.  */
>> +  unsigned dczid;
>> +  asm volatile ("mrs %0, dczid_el0" : "=r"(dczid));
>> +
>> +  if ((dczid & DCZID_DZP_MASK) == 0)
>> +    cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
>>  }
>> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
>> index 73cb53d..f2b6afd 100644
>> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
>> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
>> @@ -47,6 +47,7 @@
>>  struct cpu_features
>>  {
>>    uint64_t midr_el1;
>> +  unsigned zva_size;
>>  };
>>  
>>  #endif /* _CPU_FEATURES_AARCH64_H  */
>>
>

Patch

========================================================================================================================
                  length=256, char=0:     35936.10 (-706.66%)	     2429.88 ( 45.46%)	     2571.85 ( 42.27%)	     4454.92
                  length=257, char=0:     36209.50 (-710.17%)	     2436.12 ( 45.49%)	     2564.25 ( 42.63%)	     4469.36
                  length=258, char=0:     36507.90 (-710.21%)	     2522.06 ( 44.03%)	     2578.89 ( 42.77%)	     4505.99
                  length=259, char=0:     36764.30 (-711.99%)	     2611.61 ( 42.32%)	     2593.52 ( 42.72%)	     4527.69
                  length=260, char=0:     36943.30 (-712.62%)	     2639.06 ( 41.95%)	     2608.24 ( 42.63%)	     4546.19
                  length=261, char=0:     37287.50 (-717.27%)	     2623.07 ( 42.51%)	     2623.17 ( 42.51%)	     4562.47
                  length=262, char=0:     37573.70 (-722.44%)	     2665.51 ( 41.66%)	     2637.28 ( 42.27%)	     4568.56
                  length=263, char=0:     37833.70 (-724.30%)	     2692.70 ( 41.33%)	     2668.38 ( 41.86%)	     4589.79
                  length=264, char=0:     38136.00 (-727.49%)	     2737.30 ( 40.61%)	     2685.48 ( 41.73%)	     4608.66
                  length=265, char=0:     38403.10 (-730.30%)	     2778.70 ( 39.92%)	     2695.10 ( 41.73%)	     4625.23
                  length=266, char=0:     38684.50 (-729.88%)	     2822.16 ( 39.46%)	     2692.91 ( 42.23%)	     4661.47
                  length=267, char=0:     38954.10 (-732.30%)	     2867.41 ( 38.73%)	     2706.28 ( 42.18%)	     4680.31
                  length=268, char=0:     39155.00 (-733.08%)	     2968.76 ( 36.84%)	     2721.89 ( 42.09%)	     4700.03
                  length=269, char=0:     39559.30 (-737.49%)	     3057.49 ( 35.27%)	     2737.61 ( 42.04%)	     4723.54
                  length=270, char=0:     39813.80 (-742.51%)	     3073.64 ( 34.96%)	     2751.70 ( 41.77%)	     4725.60
                  length=271, char=0:     40070.60 (-744.40%)	     3103.55 ( 34.60%)	     2784.25 ( 41.33%)	     4745.43
                  length=512, char=0:    137515.00 (-1275.48%)	     8971.95 ( 10.26%)	     7168.66 ( 28.30%)	     9997.61
                  length=513, char=0:    138015.00 (-1284.40%)	     8987.07 (  9.85%)	     7242.59 ( 27.35%)	     9969.29
                  length=514, char=0:    138556.00 (-1286.76%)	     9200.17 (  7.92%)	     7211.49 ( 27.82%)	     9991.38
                  length=515, char=0:    139182.00 (-1277.21%)	     9223.64 (  8.73%)	     7232.78 ( 28.43%)	    10106.10
                  length=516, char=0:    139512.00 (-1288.41%)	     9306.80 (  7.38%)	     7312.15 ( 27.23%)	    10048.30
                  length=517, char=0:    140117.00 (-1292.65%)	     9429.22 (  6.28%)	     7273.52 ( 27.71%)	    10061.20
                  length=518, char=0:    140706.00 (-1294.63%)	     9463.83 (  6.20%)	     7292.57 ( 27.72%)	    10089.10
                  length=519, char=0:    141221.00 (-1289.12%)	     9548.99 (  6.07%)	     7312.75 ( 28.07%)	    10166.20
                  length=520, char=0:    141696.00 (-1297.00%)	     9713.49 (  4.27%)	     7386.44 ( 27.21%)	    10147.00
                  length=521, char=0:    142309.00 (-1298.82%)	     9888.41 (  2.80%)	     7361.91 ( 27.64%)	    10173.50
                  length=522, char=0:    142878.00 (-1292.34%)	     9909.30 (  3.43%)	     7381.22 ( 28.07%)	    10261.70
                  length=523, char=0:    143327.00 (-1300.69%)	     9918.78 (  3.07%)	     7462.93 ( 27.07%)	    10232.60
                  length=524, char=0:    143776.00 (-1301.67%)	    10055.40 (  1.97%)	     7428.56 ( 27.58%)	    10257.50
                  length=525, char=0:    144429.00 (-1296.79%)	    10090.80 (  2.41%)	     7449.84 ( 27.95%)	    10340.10
                  length=526, char=0:    144976.00 (-1305.05%)	    10178.80 (  1.35%)	     7530.66 ( 27.02%)	    10318.20
                  length=527, char=0:    145551.00 (-1306.63%)	    10314.40 (  0.32%)	     7498.48 ( 27.53%)	    10347.50
                 length=1024, char=0:    537600.00 (-2116.32%)	    34541.10 (-42.40%)	    22541.00 (  7.07%)	    24256.40
                 length=1025, char=0:    538490.00 (-2117.66%)	    34560.10 (-42.33%)	    22574.10 (  7.03%)	    24281.90
                 length=1026, char=0:    539596.00 (-2118.30%)	    34869.20 (-43.35%)	    22615.10 (  7.03%)	    24324.70
                 length=1027, char=0:    540544.00 (-2118.30%)	    35020.60 (-43.72%)	    22654.60 (  7.03%)	    24367.50
                 length=1028, char=0:    541355.00 (-2119.44%)	    35407.20 (-45.16%)	    22702.00 (  6.93%)	    24391.50
                 length=1029, char=0:    542678.00 (-2121.52%)	    35806.10 (-46.58%)	    22751.10 (  6.87%)	    24428.20
                 length=1030, char=0:    543843.00 (-2122.73%)	    35761.20 (-46.16%)	    22771.20 (  6.93%)	    24467.30
                 length=1031, char=0:    544725.00 (-2123.15%)	    35927.70 (-46.63%)	    22814.10 (  6.89%)	    24502.40
                 length=1032, char=0:    545744.00 (-2124.10%)	    35882.10 (-46.23%)	    22844.50 (  6.90%)	    24537.70
                 length=1033, char=0:    546968.00 (-2125.25%)	    36080.00 (-46.79%)	    22885.20 (  6.90%)	    24580.10
                 length=1034, char=0:    548042.00 (-2126.35%)	    36208.30 (-47.09%)	    22922.90 (  6.88%)	    24616.20
                 length=1035, char=0:    549066.00 (-2127.30%)	    36398.80 (-47.65%)	    22961.30 (  6.86%)	    24651.60
                 length=1036, char=0:    550138.00 (-2127.95%)	    36558.40 (-48.05%)	    23008.70 (  6.82%)	    24692.60
                 length=1037, char=0:    551170.00 (-2129.86%)	    36732.90 (-48.61%)	    23043.40 (  6.77%)	    24717.70
                 length=1038, char=0:    552268.00 (-2130.95%)	    36722.80 (-48.35%)	    23078.80 (  6.77%)	    24754.80
                 length=1039, char=0:    553270.00 (-2131.58%)	    36891.60 (-48.80%)	    23116.80 (  6.76%)	    24792.80

	* sysdeps/aarch64/memset.S (do_no_zva): New macro.
	(do_zva_64): Likewise.
	(do_zva_128): Likewise.
	(__memset): Rename to MEMSET macro.
	(MEMSET): Use the new macros.
	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
	Add memset_generic, memset_nozva, memset_zva_64,
	memset_zva_128 and memset_generic.
	* sysdeps/aarch64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Add memset ifuncs.
	* sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New
	local variable zva_size.
	* sysdeps/aarch64/multiarch/memset.c: New file.
	* sysdeps/aarch64/multiarch/memset_generic.S: New file.
	* sysdeps/aarch64/multiarch/memset_nozva.S: New file.
	* sysdeps/aarch64/multiarch/memset_zva_64.S: New file.
	* sysdeps/aarch64/multiarch/memset_zva_128.S: New file.
	* sysdeps/aarch64/multiarch/rtld-memset.S: New file.
	* sysdeps/unix/sysv/linux/aarch64/cpu-features.c
	(DCZID_DZP_MASK): New macro.
	(DCZID_BS_MASK): Likewise.
	(init_cpu_features): Read and set zva_size.
	* sysdeps/unix/sysv/linux/aarch64/cpu-features.h
	(struct cpu_features): New member zva_size.
---
 sysdeps/aarch64/memset.S                       | 200 +++++++++++++++----------
 sysdeps/aarch64/multiarch/Makefile             |   3 +-
 sysdeps/aarch64/multiarch/ifunc-impl-list.c    |   5 +
 sysdeps/aarch64/multiarch/init-arch.h          |   8 +-
 sysdeps/aarch64/multiarch/memset.c             |  43 ++++++
 sysdeps/aarch64/multiarch/memset_generic.S     |  28 ++++
 sysdeps/aarch64/multiarch/memset_nozva.S       |  23 +++
 sysdeps/aarch64/multiarch/memset_zva_128.S     |  24 +++
 sysdeps/aarch64/multiarch/memset_zva_64.S      |  24 +++
 sysdeps/aarch64/multiarch/rtld-memset.S        |  24 +++
 sysdeps/unix/sysv/linux/aarch64/cpu-features.c |  10 ++
 sysdeps/unix/sysv/linux/aarch64/cpu-features.h |   1 +
 12 files changed, 307 insertions(+), 86 deletions(-)
 create mode 100644 sysdeps/aarch64/multiarch/memset.c
 create mode 100644 sysdeps/aarch64/multiarch/memset_generic.S
 create mode 100644 sysdeps/aarch64/multiarch/memset_nozva.S
 create mode 100644 sysdeps/aarch64/multiarch/memset_zva_128.S
 create mode 100644 sysdeps/aarch64/multiarch/memset_zva_64.S
 create mode 100644 sysdeps/aarch64/multiarch/rtld-memset.S

diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 110fd22..9fea4c2 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -37,7 +37,105 @@ 
 #define zva_len x7
 #define zva_lenw w7
 
-ENTRY_ALIGN (__memset, 6)
+/* Macros that do the critical loops for either no zva or zva of 64 bytes, 128
+   bytes and higher sizes.  */
+
+/* No ZVA.  */
+.macro do_no_zva
+	sub	count, dstend, dst	/* Count is 16 too large.  */
+	add	dst, dst, 16
+	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
+1:	stp	q0, q0, [dst], 64
+	stp	q0, q0, [dst, -32]
+	subs	count, count, 64
+	b.hi	1b
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+.endm
+
+/* Write the first and last 64 byte aligned block using stp rather
+   than using DC ZVA.  This is faster on some cores.  */
+.macro do_zva_64
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	bic	dst, dst, 63
+	stp	q0, q0, [dst, 64]
+	stp	q0, q0, [dst, 96]
+	sub	count, dstend, dst	/* Count is now 128 too large.	*/
+	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+	nop
+1:	dc	zva, dst
+	add	dst, dst, 64
+	subs	count, count, 64
+	b.hi	1b
+	stp	q0, q0, [dst, 0]
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+.endm
+
+/* ZVA size of 128 bytes.  */
+.macro do_zva_128
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]
+	stp	q0, q0, [dst, 96]
+	bic	dst, dst, 127
+	sub	count, dstend, dst	/* Count is now 128 too large.	*/
+	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+1:	dc	zva, dst
+	add	dst, dst, 128
+	subs	count, count, 128
+	b.hi	1b
+	stp	q0, q0, [dstend, -128]
+	stp	q0, q0, [dstend, -96]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+.endm
+
+/* ZVA size of more than 128 bytes.  */
+.macro do_zva_default
+	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
+	cmp	count, tmp1
+	blo	L(no_zva)
+
+	sub	tmp2, zva_len, 1
+	add	tmp1, dst, zva_len
+	add	dst, dst, 16
+	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
+	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
+	beq	2f
+1:	stp	q0, q0, [dst], 64
+	stp	q0, q0, [dst, -32]
+	subs	count, count, 64
+	b.hi	1b
+2:	mov	dst, tmp1
+	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
+	subs	count, count, zva_len
+	b.lo	4f
+3:	dc	zva, dst
+	add	dst, dst, zva_len
+	subs	count, count, zva_len
+	b.hs	3b
+4:	add	count, count, zva_len
+	subs	count, count, 64
+	b.ls	6f
+5:	stp	q0, q0, [dst], 64
+	stp	q0, q0, [dst, -32]
+	subs	count, count, 64
+	b.hi	5b
+6:	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+.endm
+
+/* Memset entry point.  */
+ENTRY_ALIGN (MEMSET, 6)
 
 	DELOUSE (0)
 	DELOUSE (2)
@@ -89,107 +187,45 @@  L(set96):
 	.p2align 3
 	nop
 L(set_long):
+#ifdef MEMSET_ZVA
 	and	valw, valw, 255
+#endif
 	bic	dst, dstin, 15
 	str	q0, [dstin]
+#ifdef MEMSET_ZVA
 	cmp	count, 256
 	ccmp	valw, 0, 0, cs
 	b.eq	L(try_zva)
+#endif
 L(no_zva):
-	sub	count, dstend, dst	/* Count is 16 too large.  */
-	add	dst, dst, 16
-	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
-1:	stp	q0, q0, [dst], 64
-	stp	q0, q0, [dst, -32]
-L(tail64):
-	subs	count, count, 64
-	b.hi	1b
-2:	stp	q0, q0, [dstend, -64]
-	stp	q0, q0, [dstend, -32]
-	ret
+	do_no_zva
 
-	.p2align 3
+#ifdef MEMSET_ZVA
+	.p2align 4
 L(try_zva):
+# if MEMSET_ZVA == 64
+	do_zva_64
+# elif MEMSET_ZVA == 128
+	do_zva_128
+# else
 	mrs	tmp1, dczid_el0
 	tbnz	tmp1w, 4, L(no_zva)
 	and	tmp1w, tmp1w, 15
 	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
 	b.ne	 L(zva_128)
+	do_zva_64
 
-	/* Write the first and last 64 byte aligned block using stp rather
-	   than using DC ZVA.  This is faster on some cores.
-	 */
-L(zva_64):
-	str	q0, [dst, 16]
-	stp	q0, q0, [dst, 32]
-	bic	dst, dst, 63
-	stp	q0, q0, [dst, 64]
-	stp	q0, q0, [dst, 96]
-	sub	count, dstend, dst	/* Count is now 128 too large.	*/
-	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
-	add	dst, dst, 128
-	nop
-1:	dc	zva, dst
-	add	dst, dst, 64
-	subs	count, count, 64
-	b.hi	1b
-	stp	q0, q0, [dst, 0]
-	stp	q0, q0, [dst, 32]
-	stp	q0, q0, [dstend, -64]
-	stp	q0, q0, [dstend, -32]
-	ret
-
-	.p2align 3
 L(zva_128):
 	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
 	b.ne	L(zva_other)
-
-	str	q0, [dst, 16]
-	stp	q0, q0, [dst, 32]
-	stp	q0, q0, [dst, 64]
-	stp	q0, q0, [dst, 96]
-	bic	dst, dst, 127
-	sub	count, dstend, dst	/* Count is now 128 too large.	*/
-	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
-	add	dst, dst, 128
-1:	dc	zva, dst
-	add	dst, dst, 128
-	subs	count, count, 128
-	b.hi	1b
-	stp	q0, q0, [dstend, -128]
-	stp	q0, q0, [dstend, -96]
-	stp	q0, q0, [dstend, -64]
-	stp	q0, q0, [dstend, -32]
-	ret
+	do_zva_128
 
 L(zva_other):
 	mov	tmp2w, 4
 	lsl	zva_lenw, tmp2w, tmp1w
-	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
-	cmp	count, tmp1
-	blo	L(no_zva)
+	do_zva_default
+# endif
+#endif
 
-	sub	tmp2, zva_len, 1
-	add	tmp1, dst, zva_len
-	add	dst, dst, 16
-	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
-	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
-	beq	2f
-1:	stp	q0, q0, [dst], 64
-	stp	q0, q0, [dst, -32]
-	subs	count, count, 64
-	b.hi	1b
-2:	mov	dst, tmp1
-	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
-	subs	count, count, zva_len
-	b.lo	4f
-3:	dc	zva, dst
-	add	dst, dst, zva_len
-	subs	count, count, zva_len
-	b.hs	3b
-4:	add	count, count, zva_len
-	b	L(tail64)
-
-END (__memset)
-weak_alias (__memset, memset)
-libc_hidden_builtin_def (memset)
+END (MEMSET)
+libc_hidden_builtin_def (MEMSET)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 9aa1e79..c1e17e8 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,4 +1,5 @@ 
 ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
-		   memmove_falkor
+		   memmove_falkor memset_generic memset_nozva memset_zva_64 \
+		   memset_zva_128
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 2cb74d5..fb695ce 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -46,6 +46,11 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
+  IFUNC_IMPL (i, name, memset,
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_nozva)
+	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva_64)
+	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 128), __memset_zva_128)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
 
   return i;
 }
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
index 3af442c..a756dad 100644
--- a/sysdeps/aarch64/multiarch/init-arch.h
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -18,6 +18,8 @@ 
 
 #include <ldsodefs.h>
 
-#define INIT_ARCH()				\
-  uint64_t __attribute__((unused)) midr =	\
-    GLRO(dl_aarch64_cpu_features).midr_el1;
+#define INIT_ARCH()							      \
+  uint64_t __attribute__((unused)) midr =				      \
+    GLRO(dl_aarch64_cpu_features).midr_el1;				      \
+  unsigned __attribute__((unused)) zva_size =				      \
+    GLRO(dl_aarch64_cpu_features).zva_size;
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
new file mode 100644
index 0000000..a7e34c0
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -0,0 +1,43 @@ 
+/* Multiple versions of memset. AARCH64 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+
+#if IS_IN (libc)
+/* Redefine memset so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memset
+# define memset __redirect_memset
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memset) __libc_memset;
+
+extern __typeof (__redirect_memset) __memset_nozva attribute_hidden;
+extern __typeof (__redirect_memset) __memset_zva_64 attribute_hidden;
+extern __typeof (__redirect_memset) __memset_zva_128 attribute_hidden;
+extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
+
+libc_ifunc (__libc_memset, (zva_size == 0 ? __memset_nozva
+			    : (zva_size == 64 ? __memset_zva_64
+			       : (zva_size == 128 ? __memset_zva_128
+				  : __memset_generic))));
+
+# undef memset
+strong_alias (__libc_memset, memset);
+#endif
diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S
new file mode 100644
index 0000000..8871600
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_generic.S
@@ -0,0 +1,28 @@ 
+/* Memset for aarch64, default version for internal use.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# define MEMSET __memset_generic
+# define MEMSET_ZVA 1
+/* Add a hidden definition for use within libc.so.  */
+# ifdef SHARED
+	.globl __GI_memset; __GI_memset = __memset_generic
+# endif
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/aarch64/multiarch/memset_nozva.S b/sysdeps/aarch64/multiarch/memset_nozva.S
new file mode 100644
index 0000000..2d4fc42
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_nozva.S
@@ -0,0 +1,23 @@ 
+/* Memset for aarch64, ZVA disabled.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# define MEMSET __memset_nozva
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/aarch64/multiarch/memset_zva_128.S b/sysdeps/aarch64/multiarch/memset_zva_128.S
new file mode 100644
index 0000000..2c68127
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_zva_128.S
@@ -0,0 +1,24 @@ 
+/* Memset for aarch64, ZVA enabled and == 128 bytes.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# define MEMSET __memset_zva_128
+# define MEMSET_ZVA 128
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/aarch64/multiarch/memset_zva_64.S b/sysdeps/aarch64/multiarch/memset_zva_64.S
new file mode 100644
index 0000000..ff895f9
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_zva_64.S
@@ -0,0 +1,24 @@ 
+/* Memset for aarch64, ZVA enabled and == 64 bytes.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# define MEMSET __memset_zva_64
+# define MEMSET_ZVA 64
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/aarch64/multiarch/rtld-memset.S b/sysdeps/aarch64/multiarch/rtld-memset.S
new file mode 100644
index 0000000..172df42
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/rtld-memset.S
@@ -0,0 +1,24 @@ 
+/* Memset for aarch64, for the dynamic linker.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (rtld)
+# define MEMSET memset
+# define MEMSET_ZVA 1
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
index e769eeb..092ee81 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
@@ -20,6 +20,9 @@ 
 #include <sys/auxv.h>
 #include <elf/dl-hwcaps.h>
 
+#define DCZID_DZP_MASK (1 << 4)
+#define DCZID_BS_MASK (0xf)
+
 #if HAVE_TUNABLES
 struct cpu_list
 {
@@ -72,4 +75,11 @@  init_cpu_features (struct cpu_features *cpu_features)
     }
 
   cpu_features->midr_el1 = midr;
+
+  /* Check if ZVA is enabled.  */
+  unsigned dczid;
+  asm volatile ("mrs %0, dczid_el0" : "=r"(dczid));
+
+  if ((dczid & DCZID_DZP_MASK) == 0)
+    cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
 }
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index 73cb53d..f2b6afd 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -47,6 +47,7 @@ 
 struct cpu_features
 {
   uint64_t midr_el1;
+  unsigned zva_size;
 };
 
 #endif /* _CPU_FEATURES_AARCH64_H  */