[v2] aarch64: Hoist ZVA check out of the memset function

Message ID 1507031555-3046-1-git-send-email-siddhesh@sourceware.org
State Superseded

Commit Message

Siddhesh Poyarekar Oct. 3, 2017, 11:52 a.m. UTC
  The DZP bit in the dczid_el0 register does not change dynamically, so
it is safe to read once during program startup.  Hoist the zva check
into an ifunc resolver and store the result in a static variable,
which can be read in the case of non-standard zva sizes.  This
effectively adds ifunc variants for memset: one for when zva is
disabled, one for 64-byte zva, one for 128-byte zva and a default for
other zva sizes.  I have retained the older memset as __memset_generic
for internal libc.so use so that the impact of the change is minimal.
We should eventually discuss which is more expensive: reading
dczid_el0 on every memset invocation or the indirection due to the PLT.
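
In outline, the resolver's selection (condensed from the
sysdeps/aarch64/multiarch/memset.c hunk below; zva_size is the value
cached from dczid_el0 at startup by init_cpu_features) is:

    libc_ifunc (__libc_memset,
                (zva_size == 0 ? __memset_nozva
                 : (zva_size == 64 ? __memset_zva_64
                    : (zva_size == 128 ? __memset_zva_128
                       : __memset_zva_default))));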

The gains due to this are significant for falkor, as high as 80% in
some cases.  Likewise for mustang, although the numbers are slightly
lower.  Here's a sample from the falkor tests:

Function: memset
Variant: walk
                                    simple_memset	__memset_nozva	__memset_zva_64	__memset_zva_default	__memset_generic
  

Comments

Szabolcs Nagy Oct. 4, 2017, 4:09 p.m. UTC | #1
On 03/10/17 12:52, Siddhesh Poyarekar wrote:
> The DZP bit in the dczid_el0 register does not change dynamically, so
> it is safe to read once during program startup.  Hoist the zva check
> into an ifunc resolver and store the result in a static variable,
> which can be read in the case of non-standard zva sizes.  This
> effectively adds ifunc variants for memset: one for when zva is
> disabled, one for 64-byte zva, one for 128-byte zva and a default for
> other zva sizes.  I have retained the older memset as __memset_generic
> for internal libc.so use so that the impact of the change is minimal.
> We should eventually discuss which is more expensive: reading
> dczid_el0 on every memset invocation or the indirection due to the PLT.
> 
> The gains due to this are significant for falkor, as high as 80% in
> some cases.  Likewise for mustang, although the numbers are slightly
> lower.  Here's a sample from the falkor tests:
> 

the principle is ok, but the code is still not clear to me.

> +/* Memset entry point.  */
> +ENTRY_ALIGN (MEMSET, 6)
>  
>  	DELOUSE (0)
>  	DELOUSE (2)
> @@ -46,9 +147,9 @@ ENTRY_ALIGN (__memset, 6)
>  	add	dstend, dstin, count
>  
>  	cmp	count, 96
> -	b.hi	L(set_long)
> +	b.hi	MEMSET_L(set_long)
>  	cmp	count, 16
> -	b.hs	L(set_medium)
> +	b.hs	MEMSET_L(set_medium)
>  	mov	val, v0.D[0]
>  
>  	/* Set 0..15 bytes.  */
> @@ -68,9 +169,9 @@ ENTRY_ALIGN (__memset, 6)
>  3:	ret
>  
>  	/* Set 17..96 bytes.  */
> -L(set_medium):
> +MEMSET_L(set_medium):

why do we need different label names for the different memsets?

i'd expect a localized change where there are 4 variants of a
piece of code under different ifdefs.

now it's hard to see what's going on, e.g. how code alignment of
various parts of memset is affected.

> +#else
> +	/* Memset called through PLT, so we need only one of the ZVA
> +	   variants.  */
> +# ifdef MEMSET_ZVA
> +	and	valw, valw, 255
> +# endif
> +	bic	dst, dstin, 15
> +	str	q0, [dstin]
> +# ifdef MEMSET_ZVA
> +	cmp	count, 256
> +	ccmp	valw, 0, 0, cs
> +	b.eq	MEMSET_L(try_zva)
> +# endif
> +MEMSET_L(no_zva):
> +	do_no_zva
> +# if defined MEMSET_ZVA
> +MEMSET_L(try_zva):
> +#  if MEMSET_ZVA == 64
> +	do_zva_64
> +#  elif MEMSET_ZVA == 128
> +	do_zva_128
> +#  else
> +	adrp	zva_len, __aarch64_zva_size
> +	ldr	zva_len, [zva_len, #:lo12:__aarch64_zva_size]
> +	do_zva_default

i don't think we need this global,
i'd just do mrs	dczid_el0 here.
(this code should be unreachable on current cpus)
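
for reference, the decode is small (the same computation the patch
does in cpu-features.c; BS, bits 3:0 of dczid_el0, is log2 of the
block size in words, and DZP, bit 4, set means DC ZVA is prohibited):

    unsigned long dczid;
    asm ("mrs %0, dczid_el0" : "=r" (dczid));
    unsigned zva_size = (dczid & 16) ? 0 : 4 << (dczid & 0xf);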

> +#  endif
> +# endif
> +#endif
>  

> +++ b/sysdeps/aarch64/multiarch/memset_zva.S
> @@ -0,0 +1,41 @@
> +/* Memset for aarch64, ZVA enabled.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (libc)
> +# define MEMSET __memset_zva_64
> +# define MEMSET_ZVA 64
> +# define MEMSET_L(label) L(label ## _zva64)
> +# include <sysdeps/aarch64/memset.S>
> +
> +# undef MEMSET
> +# undef MEMSET_ZVA
> +# undef MEMSET_L
> +# define MEMSET __memset_zva_128
> +# define MEMSET_ZVA 128
> +# define MEMSET_L(label) L(label ## _zva128)
> +# include <sysdeps/aarch64/memset.S>
> +
> +# undef MEMSET
> +# undef MEMSET_ZVA
> +# undef MEMSET_L
> +# define MEMSET __memset_zva_default
> +# define MEMSET_ZVA 1
> +# define MEMSET_L(label) L(label ## _zvadef)
> +# include <sysdeps/aarch64/memset.S>
> +#endif

i think this is not ok, these should be in different files.

you put code that is never used together in the same object file.

(ideally the ifunc code for various cpus should be grouped
together by cpu and on different pages for each cpu so you
don't even load the code at runtime that is not for your cpu)

> diff --git a/sysdeps/aarch64/multiarch/rtld-memset.S b/sysdeps/aarch64/multiarch/rtld-memset.S
> new file mode 100644
> index 0000000..969cf14
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/rtld-memset.S
> @@ -0,0 +1,25 @@
> +/* Memset for aarch64 for use within the dynamic linker.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (rtld)
> +# define MEMSET memset
> +# define INTERNAL_MEMSET
> +# define MEMSET_L(label) L(label)
> +# include <sysdeps/aarch64/memset.S>
> +#endif

is this performance critical?
(i don't know if it's only used for early dynlinker startup code
or all libc internal memsets)

if not then we don't need it to be a different variant, it could be
just a copy of the _nozva/_defaultzva case

> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> index e769eeb..092ee81 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> @@ -20,6 +20,9 @@
>  #include <sys/auxv.h>
>  #include <elf/dl-hwcaps.h>
>  
> +#define DCZID_DZP_MASK (1 << 4)
> +#define DCZID_BS_MASK (0xf)
> +
>  #if HAVE_TUNABLES
>  struct cpu_list
>  {
> @@ -72,4 +75,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>      }
>  
>    cpu_features->midr_el1 = midr;
> +
> +  /* Check if ZVA is enabled.  */
> +  unsigned dczid;
> +  asm volatile ("mrs %0, dczid_el0" : "=r"(dczid));
> +
> +  if ((dczid & DCZID_DZP_MASK) == 0)
> +    cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
>  }

from this code it's not clear to me that cpu_features->zva_size
is always initialized.
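
e.g. an explicit else branch would make the zero case obvious:

    if ((dczid & DCZID_DZP_MASK) == 0)
      cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
    else
      cpu_features->zva_size = 0;  /* DC ZVA prohibited.  */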

> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> index 73cb53d..f2b6afd 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> @@ -47,6 +47,7 @@
>  struct cpu_features
>  {
>    uint64_t midr_el1;
> +  unsigned zva_size;
>  };
>  
>  #endif /* _CPU_FEATURES_AARCH64_H  */
>
  
Siddhesh Poyarekar Oct. 5, 2017, 4:22 a.m. UTC | #2
On Wednesday 04 October 2017 09:39 PM, Szabolcs Nagy wrote:
>> +/* Memset entry point.  */
>> +ENTRY_ALIGN (MEMSET, 6)
>>  
>>  	DELOUSE (0)
>>  	DELOUSE (2)
>> @@ -46,9 +147,9 @@ ENTRY_ALIGN (__memset, 6)
>>  	add	dstend, dstin, count
>>  
>>  	cmp	count, 96
>> -	b.hi	L(set_long)
>> +	b.hi	MEMSET_L(set_long)
>>  	cmp	count, 16
>> -	b.hs	L(set_medium)
>> +	b.hs	MEMSET_L(set_medium)
>>  	mov	val, v0.D[0]
>>  
>>  	/* Set 0..15 bytes.  */
>> @@ -68,9 +169,9 @@ ENTRY_ALIGN (__memset, 6)
>>  3:	ret
>>  
>>  	/* Set 17..96 bytes.  */
>> -L(set_medium):
>> +MEMSET_L(set_medium):
> 
> why do we need different label names for the different memsets?
> 
> i'd expect a localized change where there are 4 variants of a
> piece of code under different ifdefs.

All of the variants are built in the same source file (memset_zva.S),
which is why they need different sets of labels.  I chose that to
avoid blowing up the number of files, but I can split it into
individual files and get rid of this macro.
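
Each include of memset.S emits its own copy of every local label, so
memset_zva.S renames them per variant, e.g.:

    #define MEMSET_L(label) L(label ## _zva64)
    /* Each L() label in this copy of memset.S, e.g. L(set_long),
       becomes L(set_long_zva64), so the three copies built into one
       object file cannot collide.  */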

> now it's hard to see what's going on, e.g. how code alignment of
> various parts of memset is affected.

Code alignment is not the focus of this change; I'll fine-tune it in
further iterations, since it gives only minor comparative gains.

> i don't think we need this global,
> i'd just do mrs	dczid_el0 here.
> (this code should be unreachable on current cpus)

Fair enough.  FWIW, I test this routine unconditionally.

>> +# undef MEMSET
>> +# undef MEMSET_ZVA
>> +# undef MEMSET_L
>> +# define MEMSET __memset_zva_default
>> +# define MEMSET_ZVA 1
>> +# define MEMSET_L(label) L(label ## _zvadef)
>> +# include <sysdeps/aarch64/memset.S>
>> +#endif
> 
> i think this is not ok, these should be in different files.
> 
> you put code that is never used together in the same object file.
> 
> (ideally the ifunc code for various cpus should be grouped
> together by cpu and on different pages for each cpu so you
> don't even load the code at runtime that is not for your cpu)

Yeah, but these are not different CPU types.  Anyway, I don't have a
strong opinion on this, so I'll just split it up.  That will also get
rid of the macro that confused you.

>> diff --git a/sysdeps/aarch64/multiarch/rtld-memset.S b/sysdeps/aarch64/multiarch/rtld-memset.S
>> new file mode 100644
>> index 0000000..969cf14
>> --- /dev/null
>> +++ b/sysdeps/aarch64/multiarch/rtld-memset.S
>> @@ -0,0 +1,25 @@
>> +/* Memset for aarch64 for use within the dynamic linker.
>> +   Copyright (C) 2017 Free Software Foundation, Inc.
>> +
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library.  If not, see
>> +   <http://www.gnu.org/licenses/>.  */
>> +
>> +#if IS_IN (rtld)
>> +# define MEMSET memset
>> +# define INTERNAL_MEMSET
>> +# define MEMSET_L(label) L(label)
>> +# include <sysdeps/aarch64/memset.S>
>> +#endif
> 
> is this performance critical?
> (i don't know if it's only used for early dynlinker startup code
> or all libc internal memsets)
> 
> if not then we don't need it to be a different variant, it could be
> just a copy of the _nozva/_defaultzva case

This is used only in rtld; the libc-internal one is __memset_generic.
It is not really performance critical, but given that I'll be including
one or the other anyway, I chose to include memset.S instead of
memset_nozva.S.  I can do the latter.

> from this code it's not clear to me that cpu_features->zva_size
> is always initialized.

cpu_features is static, so it is always initialized to zero.

Thanks, I'll post an updated version.

Siddhesh
  
Szabolcs Nagy Oct. 5, 2017, 9:30 a.m. UTC | #3
On 05/10/17 05:22, Siddhesh Poyarekar wrote:
> On Wednesday 04 October 2017 09:39 PM, Szabolcs Nagy wrote:
>>> +#if IS_IN (rtld)
>>> +# define MEMSET memset
>>> +# define INTERNAL_MEMSET
>>> +# define MEMSET_L(label) L(label)
>>> +# include <sysdeps/aarch64/memset.S>
>>> +#endif
>>
>> is this performance critical?
>> (i don't know if it's only used for early dynlinker startup code
>> or all libc internal memsets)
>>
>> if not then we don't need it to be a different variant, it could be
>> just a copy of the _nozva/_defaultzva case
> 
> This is used only in rtld; the libc-internal one is __memset_generic.
> It is not really performance critical, but given that I'll be including
> one or the other anyway, I chose to include memset.S instead of
> memset_nozva.S.  I can do the latter.
> 

it is ok to use the generic one; it just seemed that the generic
and default_zva variants could be unified (only one of them is
necessary) and then used for the rtld, generic and non-conventional
zva size cases.
  
Siddhesh Poyarekar Oct. 5, 2017, 9:56 a.m. UTC | #4
On Thursday 05 October 2017 03:00 PM, Szabolcs Nagy wrote:
> it is ok to use the generic one; it just seemed that the generic
> and default_zva variants could be unified (only one of them is
> necessary) and then used for the rtld, generic and non-conventional
> zva size cases.

default_zva is not the same as the generic one because the former
assumes that zva is enabled.  So your suggestion basically boils down to
dropping default_zva.  It's sub-optimal for the default_zva case, but it
doesn't matter for now since there aren't any cores that will actually
hit that.

Siddhesh
  

Patch

========================================================================================================================
                  length=256, char=0:         1.82 (-87.26%)	       26.99 ( 89.30%)	       25.49 ( 78.76%)	       23.48 ( 64.65%)	       14.26
                  length=257, char=0:         1.82 (-87.29%)	       26.97 ( 88.44%)	       25.77 ( 80.12%)	       24.41 ( 70.57%)	       14.31
                  length=258, char=0:         1.82 (-87.38%)	       26.27 ( 82.29%)	       25.84 ( 79.28%)	       24.33 ( 68.80%)	       14.41
                  length=259, char=0:         1.82 (-87.36%)	       26.06 ( 81.15%)	       25.72 ( 78.84%)	       24.57 ( 70.80%)	       14.38
                  length=260, char=0:         1.82 (-87.44%)	       25.35 ( 75.23%)	       25.93 ( 79.23%)	       24.34 ( 68.24%)	       14.47
                  length=261, char=0:         1.82 (-87.49%)	       26.15 ( 79.70%)	       26.01 ( 78.72%)	       24.44 ( 67.97%)	       14.55
                  length=262, char=0:         1.82 (-87.54%)	       25.91 ( 77.31%)	       26.06 ( 78.35%)	       24.33 ( 66.49%)	       14.61
                  length=263, char=0:         1.82 (-87.54%)	       25.69 ( 75.80%)	       25.96 ( 77.63%)	       24.54 ( 67.90%)	       14.61
                  length=264, char=0:         1.82 (-87.57%)	       25.31 ( 72.69%)	       26.16 ( 78.43%)	       24.63 ( 68.00%)	       14.66
                  length=265, char=0:         1.82 (-87.65%)	       25.29 ( 71.35%)	       26.25 ( 77.84%)	       24.58 ( 66.53%)	       14.76
                  length=266, char=0:         1.82 (-87.69%)	       25.10 ( 69.40%)	       26.15 ( 76.48%)	       24.77 ( 67.22%)	       14.82
                  length=267, char=0:         1.82 (-87.69%)	       24.89 ( 68.02%)	       26.20 ( 76.90%)	       24.87 ( 67.91%)	       14.81
                  length=268, char=0:         1.82 (-87.74%)	       24.07 ( 62.04%)	       26.40 ( 77.74%)	       24.95 ( 67.93%)	       14.85
                  length=269, char=0:         1.82 (-87.80%)	       23.82 ( 59.29%)	       26.47 ( 77.00%)	       24.89 ( 66.43%)	       14.96
                  length=270, char=0:         1.82 (-87.84%)	       23.65 ( 57.61%)	       26.35 ( 75.58%)	       25.07 ( 67.07%)	       15.01
                  length=271, char=0:         1.83 (-87.82%)	       23.48 ( 56.53%)	       26.39 ( 75.93%)	       25.15 ( 67.66%)	       15.00
                  length=512, char=0:         1.90 (-92.59%)	       29.25 ( 13.81%)	       36.30 ( 41.27%)	       40.95 ( 59.36%)	       25.70
                  length=513, char=0:         1.90 (-92.57%)	       29.29 ( 14.35%)	       36.63 ( 43.01%)	       40.80 ( 59.28%)	       25.61
                  length=514, char=0:         1.90 (-92.62%)	       28.61 ( 10.91%)	       36.64 ( 42.05%)	       40.89 ( 58.52%)	       25.80
                  length=515, char=0:         1.90 (-92.63%)	       28.74 ( 11.29%)	       36.68 ( 42.06%)	       40.56 ( 57.08%)	       25.82
                  length=516, char=0:         1.90 (-92.65%)	       28.33 (  9.54%)	       36.72 ( 41.96%)	       40.09 ( 55.01%)	       25.87
                  length=517, char=0:         1.90 (-92.66%)	       28.41 (  9.60%)	       36.80 ( 41.97%)	       39.43 ( 52.13%)	       25.92
                  length=518, char=0:         1.90 (-92.66%)	       28.16 (  8.45%)	       36.84 ( 41.89%)	       39.40 ( 51.77%)	       25.96
                  length=519, char=0:         1.90 (-92.67%)	       28.21 (  8.58%)	       36.86 ( 41.86%)	       40.39 ( 55.46%)	       25.98
                  length=520, char=0:         1.90 (-92.65%)	       27.53 (  6.32%)	       36.90 ( 42.49%)	       40.80 ( 57.58%)	       25.89
                  length=521, char=0:         1.90 (-92.69%)	       27.53 (  5.65%)	       36.61 ( 40.50%)	       40.86 ( 56.81%)	       26.05
                  length=522, char=0:         1.90 (-92.66%)	       27.40 (  5.59%)	       36.95 ( 42.35%)	       40.92 ( 57.64%)	       25.95
                  length=523, char=0:         1.91 (-92.71%)	       27.50 (  5.29%)	       36.69 ( 40.45%)	       40.97 ( 56.84%)	       26.12
                  length=524, char=0:         1.90 (-92.69%)	       27.33 (  5.02%)	       37.00 ( 42.18%)	       40.98 ( 57.50%)	       26.02
                  length=525, char=0:         1.91 (-92.72%)	       27.24 (  4.04%)	       36.70 ( 40.13%)	       41.04 ( 56.72%)	       26.19
                  length=526, char=0:         1.90 (-92.70%)	       27.06 (  3.73%)	       37.06 ( 42.05%)	       41.08 ( 57.44%)	       26.09
                  length=527, char=0:         1.91 (-92.74%)	       26.82 (  2.17%)	       37.06 ( 41.17%)	       41.11 ( 56.62%)	       26.25
                 length=1024, char=0:         1.95 (-95.35%)	       30.55 (-27.12%)	       46.52 ( 10.99%)	       49.89 ( 19.02%)	       41.91
                 length=1025, char=0:         1.95 (-95.31%)	       30.58 (-26.47%)	       46.57 ( 11.98%)	       49.92 ( 20.05%)	       41.59
                 length=1026, char=0:         1.95 (-95.36%)	       30.35 (-27.70%)	       46.56 ( 10.92%)	       49.45 ( 17.79%)	       41.98
                 length=1027, char=0:         1.95 (-95.36%)	       30.24 (-28.02%)	       46.20 (  9.98%)	       49.93 ( 18.88%)	       42.00
                 length=1028, char=0:         1.95 (-95.37%)	       29.75 (-29.25%)	       46.58 ( 10.76%)	       49.92 ( 18.71%)	       42.05
                 length=1029, char=0:         1.95 (-95.37%)	       29.78 (-29.24%)	       46.57 ( 10.65%)	       49.96 ( 18.72%)	       42.08
                 length=1030, char=0:         1.95 (-95.33%)	       29.77 (-28.73%)	       46.63 ( 11.63%)	       49.97 ( 19.64%)	       41.77
                 length=1031, char=0:         1.95 (-95.37%)	       29.64 (-29.68%)	       46.62 ( 10.59%)	       49.51 ( 17.46%)	       42.15
                 length=1032, char=0:         1.95 (-95.38%)	       29.60 (-29.80%)	       46.22 (  9.63%)	       49.99 ( 18.58%)	       42.16
                 length=1033, char=0:         1.95 (-95.38%)	       29.32 (-30.55%)	       46.65 ( 10.49%)	       49.95 ( 18.32%)	       42.22
                 length=1034, char=0:         1.95 (-95.39%)	       29.45 (-30.31%)	       46.67 ( 10.44%)	       50.01 ( 18.36%)	       42.25
                 length=1035, char=0:         1.95 (-95.35%)	       29.31 (-30.09%)	       46.68 ( 11.34%)	       50.02 ( 19.31%)	       41.92
                 length=1036, char=0:         1.95 (-95.40%)	       29.30 (-30.75%)	       46.66 ( 10.27%)	       49.56 ( 17.12%)	       42.32
                 length=1037, char=0:         1.95 (-95.39%)	       29.17 (-31.08%)	       46.30 (  9.38%)	       50.04 ( 18.22%)	       42.33
                 length=1038, char=0:         1.95 (-95.40%)	       29.12 (-31.30%)	       46.71 ( 10.19%)	       50.02 ( 18.01%)	       42.39
                 length=1039, char=0:         1.95 (-95.40%)	       29.19 (-31.20%)	       46.73 ( 10.14%)	       50.06 ( 18.00%)	       42.43

	* sysdeps/aarch64/memset.S (do_no_zva, do_zva_64, do_zva_128,
	do_zva_default): New macros.
	(__memset): Rename to MEMSET macro.
	(MEMSET): Use the new macros.
	(MEMSET)[INTERNAL_MEMSET]: Retain old memset.
	(MEMSET)[!INTERNAL_MEMSET]: Remove zva check.
	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
	Add memset_generic, memset_nozva and memset_zva.
	* sysdeps/aarch64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Add memset ifuncs.
	* sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New
	static variable __aarch64_zva_size and local variable
	zva_size.
	* sysdeps/aarch64/multiarch/memset.c: New file.
	* sysdeps/aarch64/multiarch/memset_generic.S: New file.
	* sysdeps/aarch64/multiarch/memset_nozva.S: New file.
	* sysdeps/aarch64/multiarch/memset_zva.S: New file.
	* sysdeps/aarch64/multiarch/rtld-memset.S: New file.
	* sysdeps/unix/sysv/linux/aarch64/cpu-features.c
	(DCZID_DZP_MASK, DCZID_BS_MASK): New macros.
	(init_cpu_features): Read and set zva_size.
	* sysdeps/unix/sysv/linux/aarch64/cpu-features.h
	(struct cpu_features): New member zva_size.

Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
---
 ChangeLog                                      |  26 +++
 sysdeps/aarch64/memset.S                       | 248 +++++++++++++++----------
 sysdeps/aarch64/multiarch/Makefile             |   3 +-
 sysdeps/aarch64/multiarch/ifunc-impl-list.c    |   6 +
 sysdeps/aarch64/multiarch/init-arch.h          |   9 +-
 sysdeps/aarch64/multiarch/memset.c             |  45 +++++
 sysdeps/aarch64/multiarch/memset_generic.S     |  30 +++
 sysdeps/aarch64/multiarch/memset_nozva.S       |  24 +++
 sysdeps/aarch64/multiarch/memset_zva.S         |  41 ++++
 sysdeps/aarch64/multiarch/rtld-memset.S        |  25 +++
 sysdeps/unix/sysv/linux/aarch64/cpu-features.c |  10 +
 sysdeps/unix/sysv/linux/aarch64/cpu-features.h |   1 +
 12 files changed, 368 insertions(+), 100 deletions(-)
 create mode 100644 sysdeps/aarch64/multiarch/memset.c
 create mode 100644 sysdeps/aarch64/multiarch/memset_generic.S
 create mode 100644 sysdeps/aarch64/multiarch/memset_nozva.S
 create mode 100644 sysdeps/aarch64/multiarch/memset_zva.S
 create mode 100644 sysdeps/aarch64/multiarch/rtld-memset.S

diff --git a/ChangeLog b/ChangeLog
index d2444dd..a4f0976 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,29 @@ 
+2017-10-03  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+	* sysdeps/aarch64/memset.S (do_no_zva, do_zva_64, do_zva_128,
+	do_zva_default): New macros.
+	(__memset): Rename to MEMSET macro.
+	(MEMSET): Use the new macros.
+	(MEMSET)[INTERNAL_MEMSET]: Retain old memset.
+	(MEMSET)[!INTERNAL_MEMSET]: Remove zva check.
+	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
+	Add memset_generic, memset_nozva and memset_zva.
+	* sysdeps/aarch64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add memset ifuncs.
+	* sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New
+	static variable __aarch64_zva_size and local variable
+	zva_size.
+	* sysdeps/aarch64/multiarch/memset.c: New file.
+	* sysdeps/aarch64/multiarch/memset_generic.S: New file.
+	* sysdeps/aarch64/multiarch/memset_nozva.S: New file.
+	* sysdeps/aarch64/multiarch/memset_zva.S: New file.
+	* sysdeps/aarch64/multiarch/rtld-memset.S: New file.
+	* sysdeps/unix/sysv/linux/aarch64/cpu-features.c
+	(DCZID_DZP_MASK, DCZID_BS_MASK): New macros.
+	(init_cpu_features): Read and set zva_size.
+	* sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+	(struct cpu_features): New member zva_size.
+
 2017-10-03  H.J. Lu  <hongjiu.lu@intel.com>
 
 	* elf/rtld.c (BOOTSTRAP_MAP): New.
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 110fd22..8cff3a4 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -37,7 +37,108 @@ 
 #define zva_len x7
 #define zva_lenw w7
 
-ENTRY_ALIGN (__memset, 6)
+/* Macros that do the critical loops for either no zva or zva of 64 bytes, 128
+   bytes and higher sizes.  */
+
+#ifndef ZVA_MACROS
+# define ZVA_MACROS
+/* No ZVA.  */
+.macro do_no_zva
+	sub	count, dstend, dst	/* Count is 16 too large.  */
+	add	dst, dst, 16
+	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
+1:	stp	q0, q0, [dst], 64
+	stp	q0, q0, [dst, -32]
+	subs	count, count, 64
+	b.hi	1b
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+.endm
+
+/* Write the first and last 64 byte aligned block using stp rather
+   than using DC ZVA.  This is faster on some cores.  */
+.macro do_zva_64
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	bic	dst, dst, 63
+	stp	q0, q0, [dst, 64]
+	stp	q0, q0, [dst, 96]
+	sub	count, dstend, dst	/* Count is now 128 too large.	*/
+	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+	nop
+1:	dc	zva, dst
+	add	dst, dst, 64
+	subs	count, count, 64
+	b.hi	1b
+	stp	q0, q0, [dst, 0]
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+.endm
+
+/* ZVA size of 128 bytes.  */
+.macro do_zva_128
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]
+	stp	q0, q0, [dst, 96]
+	bic	dst, dst, 127
+	sub	count, dstend, dst	/* Count is now 128 too large.	*/
+	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+1:	dc	zva, dst
+	add	dst, dst, 128
+	subs	count, count, 128
+	b.hi	1b
+	stp	q0, q0, [dstend, -128]
+	stp	q0, q0, [dstend, -96]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+.endm
+
+/* ZVA size of more than 128 bytes.  */
+.macro do_zva_default
+	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
+	cmp	count, tmp1
+	blo	MEMSET_L(no_zva)
+
+	sub	tmp2, zva_len, 1
+	add	tmp1, dst, zva_len
+	add	dst, dst, 16
+	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
+	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
+	beq	2f
+1:	stp	q0, q0, [dst], 64
+	stp	q0, q0, [dst, -32]
+	subs	count, count, 64
+	b.hi	1b
+2:	mov	dst, tmp1
+	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
+	subs	count, count, zva_len
+	b.lo	4f
+3:	dc	zva, dst
+	add	dst, dst, zva_len
+	subs	count, count, zva_len
+	b.hs	3b
+4:	add	count, count, zva_len
+	subs	count, count, 64
+	b.ls	6f
+5:	stp	q0, q0, [dst], 64
+	stp	q0, q0, [dst, -32]
+	subs	count, count, 64
+	b.hi	5b
+6:	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+.endm
+#endif
+
+/* Memset entry point.  */
+ENTRY_ALIGN (MEMSET, 6)
 
 	DELOUSE (0)
 	DELOUSE (2)
@@ -46,9 +147,9 @@  ENTRY_ALIGN (__memset, 6)
 	add	dstend, dstin, count
 
 	cmp	count, 96
-	b.hi	L(set_long)
+	b.hi	MEMSET_L(set_long)
 	cmp	count, 16
-	b.hs	L(set_medium)
+	b.hs	MEMSET_L(set_medium)
 	mov	val, v0.D[0]
 
 	/* Set 0..15 bytes.  */
@@ -68,9 +169,9 @@  ENTRY_ALIGN (__memset, 6)
 3:	ret
 
 	/* Set 17..96 bytes.  */
-L(set_medium):
+MEMSET_L(set_medium):
 	str	q0, [dstin]
-	tbnz	count, 6, L(set96)
+	tbnz	count, 6, MEMSET_L(set96)
 	str	q0, [dstend, -16]
 	tbz	count, 5, 1f
 	str	q0, [dstin, 16]
@@ -80,7 +181,7 @@  L(set_medium):
 	.p2align 4
 	/* Set 64..96 bytes.  Write 64 bytes from the start and
 	   32 bytes from the end.  */
-L(set96):
+MEMSET_L(set96):
 	str	q0, [dstin, 16]
 	stp	q0, q0, [dstin, 32]
 	stp	q0, q0, [dstend, -32]
@@ -88,108 +189,63 @@  L(set96):
 
 	.p2align 3
 	nop
-L(set_long):
+MEMSET_L(set_long):
+#ifdef INTERNAL_MEMSET
 	and	valw, valw, 255
 	bic	dst, dstin, 15
 	str	q0, [dstin]
 	cmp	count, 256
 	ccmp	valw, 0, 0, cs
-	b.eq	L(try_zva)
-L(no_zva):
-	sub	count, dstend, dst	/* Count is 16 too large.  */
-	add	dst, dst, 16
-	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
-1:	stp	q0, q0, [dst], 64
-	stp	q0, q0, [dst, -32]
-L(tail64):
-	subs	count, count, 64
-	b.hi	1b
-2:	stp	q0, q0, [dstend, -64]
-	stp	q0, q0, [dstend, -32]
-	ret
+	b.eq	MEMSET_L(try_zva)
 
-	.p2align 3
-L(try_zva):
+MEMSET_L(no_zva):
+	do_no_zva
+
+	.p2align 4
+MEMSET_L(try_zva):
 	mrs	tmp1, dczid_el0
-	tbnz	tmp1w, 4, L(no_zva)
 	and	tmp1w, tmp1w, 15
 	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
-	b.ne	 L(zva_128)
+	b.ne	 MEMSET_L(zva_128)
+	do_zva_64
 
-	/* Write the first and last 64 byte aligned block using stp rather
-	   than using DC ZVA.  This is faster on some cores.
-	 */
-L(zva_64):
-	str	q0, [dst, 16]
-	stp	q0, q0, [dst, 32]
-	bic	dst, dst, 63
-	stp	q0, q0, [dst, 64]
-	stp	q0, q0, [dst, 96]
-	sub	count, dstend, dst	/* Count is now 128 too large.	*/
-	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
-	add	dst, dst, 128
-	nop
-1:	dc	zva, dst
-	add	dst, dst, 64
-	subs	count, count, 64
-	b.hi	1b
-	stp	q0, q0, [dst, 0]
-	stp	q0, q0, [dst, 32]
-	stp	q0, q0, [dstend, -64]
-	stp	q0, q0, [dstend, -32]
-	ret
-
-	.p2align 3
-L(zva_128):
+MEMSET_L(zva_128):
 	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
-	b.ne	L(zva_other)
+	b.ne	MEMSET_L(zva_other)
+	do_zva_128
 
-	str	q0, [dst, 16]
-	stp	q0, q0, [dst, 32]
-	stp	q0, q0, [dst, 64]
-	stp	q0, q0, [dst, 96]
-	bic	dst, dst, 127
-	sub	count, dstend, dst	/* Count is now 128 too large.	*/
-	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
-	add	dst, dst, 128
-1:	dc	zva, dst
-	add	dst, dst, 128
-	subs	count, count, 128
-	b.hi	1b
-	stp	q0, q0, [dstend, -128]
-	stp	q0, q0, [dstend, -96]
-	stp	q0, q0, [dstend, -64]
-	stp	q0, q0, [dstend, -32]
-	ret
-
-L(zva_other):
+MEMSET_L(zva_other):
 	mov	tmp2w, 4
 	lsl	zva_lenw, tmp2w, tmp1w
-	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
-	cmp	count, tmp1
-	blo	L(no_zva)
+	do_zva_default
+#else
+	/* Memset called through PLT, so we need only one of the ZVA
+	   variants.  */
+# ifdef MEMSET_ZVA
+	and	valw, valw, 255
+# endif
+	bic	dst, dstin, 15
+	str	q0, [dstin]
+# ifdef MEMSET_ZVA
+	cmp	count, 256
+	ccmp	valw, 0, 0, cs
+	b.eq	MEMSET_L(try_zva)
+# endif
+MEMSET_L(no_zva):
+	do_no_zva
+# if defined MEMSET_ZVA
+MEMSET_L(try_zva):
+#  if MEMSET_ZVA == 64
+	do_zva_64
+#  elif MEMSET_ZVA == 128
+	do_zva_128
+#  else
+	adrp	zva_len, __aarch64_zva_size
+	ldr	zva_len, [zva_len, #:lo12:__aarch64_zva_size]
+	do_zva_default
+#  endif
+# endif
+#endif
 
-	sub	tmp2, zva_len, 1
-	add	tmp1, dst, zva_len
-	add	dst, dst, 16
-	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
-	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
-	beq	2f
-1:	stp	q0, q0, [dst], 64
-	stp	q0, q0, [dst, -32]
-	subs	count, count, 64
-	b.hi	1b
-2:	mov	dst, tmp1
-	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
-	subs	count, count, zva_len
-	b.lo	4f
-3:	dc	zva, dst
-	add	dst, dst, zva_len
-	subs	count, count, zva_len
-	b.hs	3b
-4:	add	count, count, zva_len
-	b	L(tail64)
-
-END (__memset)
-weak_alias (__memset, memset)
-libc_hidden_builtin_def (memset)
+END (MEMSET)
+libc_hidden_builtin_def (MEMSET)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 164ba1a..7fe8254 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,3 +1,4 @@ 
 ifeq ($(subdir),string)
-sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor
+sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
+		   memset_generic memset_nozva memset_zva
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 8e873b3..fec52b7 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -45,6 +45,12 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, memmove,
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
+  IFUNC_IMPL (i, name, memset,
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_nozva)
+	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva_64)
+	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 128), __memset_zva_128)
+	      IFUNC_IMPL_ADD (array, i, memset, (zva_size > 0), __memset_zva_default)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
 
   return i;
 }
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
index 3af442c..541c27e 100644
--- a/sysdeps/aarch64/multiarch/init-arch.h
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -18,6 +18,9 @@ 
 
 #include <ldsodefs.h>
 
-#define INIT_ARCH()				\
-  uint64_t __attribute__((unused)) midr =	\
-    GLRO(dl_aarch64_cpu_features).midr_el1;
+#define INIT_ARCH()							      \
+  uint64_t __attribute__((unused)) midr =				      \
+    GLRO(dl_aarch64_cpu_features).midr_el1;				      \
+  extern unsigned __aarch64_zva_size;					      \
+  unsigned __attribute__((unused)) zva_size = __aarch64_zva_size =	      \
+    GLRO(dl_aarch64_cpu_features).zva_size;
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
new file mode 100644
index 0000000..18ffd73
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -0,0 +1,45 @@ 
+/* Multiple versions of memset. AARCH64 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+
+#if IS_IN (libc)
+/* Redefine memset so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memset
+# define memset __redirect_memset
+# include <string.h>
+# include <init-arch.h>
+
+unsigned __aarch64_zva_size;
+
+extern __typeof (__redirect_memset) __libc_memset;
+
+extern __typeof (__redirect_memset) __memset_nozva attribute_hidden;
+extern __typeof (__redirect_memset) __memset_zva_64 attribute_hidden;
+extern __typeof (__redirect_memset) __memset_zva_128 attribute_hidden;
+extern __typeof (__redirect_memset) __memset_zva_default attribute_hidden;
+
+libc_ifunc (__libc_memset, (zva_size == 0 ? __memset_nozva
+			    : (zva_size == 64 ? __memset_zva_64
+			       : (zva_size == 128 ? __memset_zva_128
+				  : __memset_zva_default))));
+
+# undef memset
+strong_alias (__libc_memset, memset);
+#endif
diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S
new file mode 100644
index 0000000..8d32c43
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_generic.S
@@ -0,0 +1,30 @@ 
+/* Memset for aarch64, default version for internal use.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# define MEMSET __memset_generic
+# define INTERNAL_MEMSET
+# define MEMSET_L(label) L(label)
+/* Add a hidden definition for use within libc.so.  */
+# ifdef SHARED
+	.globl __GI_memset; __GI_memset = __memset_generic
+# endif
+
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/aarch64/multiarch/memset_nozva.S b/sysdeps/aarch64/multiarch/memset_nozva.S
new file mode 100644
index 0000000..a41b575
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_nozva.S
@@ -0,0 +1,24 @@ 
+/* Memset for aarch64, ZVA disabled.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# define MEMSET __memset_nozva
+# define MEMSET_L(label) L(label)
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/aarch64/multiarch/memset_zva.S b/sysdeps/aarch64/multiarch/memset_zva.S
new file mode 100644
index 0000000..5d02b89
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_zva.S
@@ -0,0 +1,41 @@ 
+/* Memset for aarch64, ZVA enabled.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# define MEMSET __memset_zva_64
+# define MEMSET_ZVA 64
+# define MEMSET_L(label) L(label ## _zva64)
+# include <sysdeps/aarch64/memset.S>
+
+# undef MEMSET
+# undef MEMSET_ZVA
+# undef MEMSET_L
+# define MEMSET __memset_zva_128
+# define MEMSET_ZVA 128
+# define MEMSET_L(label) L(label ## _zva128)
+# include <sysdeps/aarch64/memset.S>
+
+# undef MEMSET
+# undef MEMSET_ZVA
+# undef MEMSET_L
+# define MEMSET __memset_zva_default
+# define MEMSET_ZVA 1
+# define MEMSET_L(label) L(label ## _zvadef)
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/aarch64/multiarch/rtld-memset.S b/sysdeps/aarch64/multiarch/rtld-memset.S
new file mode 100644
index 0000000..969cf14
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/rtld-memset.S
@@ -0,0 +1,25 @@ 
+/* Memset for aarch64 for use within the dynamic linker.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (rtld)
+# define MEMSET memset
+# define INTERNAL_MEMSET
+# define MEMSET_L(label) L(label)
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
index e769eeb..092ee81 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
@@ -20,6 +20,9 @@ 
 #include <sys/auxv.h>
 #include <elf/dl-hwcaps.h>
 
+#define DCZID_DZP_MASK (1 << 4)
+#define DCZID_BS_MASK (0xf)
+
 #if HAVE_TUNABLES
 struct cpu_list
 {
@@ -72,4 +75,11 @@  init_cpu_features (struct cpu_features *cpu_features)
     }
 
   cpu_features->midr_el1 = midr;
+
+  /* Check if ZVA is enabled.  */
+  unsigned dczid;
+  asm volatile ("mrs %0, dczid_el0" : "=r"(dczid));
+
+  if ((dczid & DCZID_DZP_MASK) == 0)
+    cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
 }
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index 73cb53d..f2b6afd 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -47,6 +47,7 @@ 
 struct cpu_features
 {
   uint64_t midr_el1;
+  unsigned zva_size;
 };
 
 #endif /* _CPU_FEATURES_AARCH64_H  */