[1/2] Hoist ZVA check out of the function
Commit Message
The DZP bit in the dczid_el0 register does not change dynamically, so
it is safe to read once during program startup. Hoist the zva check
into an ifunc resolver and store the result into a static variable,
which can be read in case of non-standard zva sizes. This effectively
adds 4 ifunc variants for memset - one for cases where zva is
disabled, one for 64 byte zva, one for 128 byte zva and a default one
for other zva sizes. I have
retained the older memset as __memset_generic for internal libc.so use
so that the change impact is minimal. We should eventually have a
discussion on which is more expensive: reading dczid_el0 on every
memset invocation or the indirection due to the PLT.
The gains due to this are significant for falkor, with gains as high
as 80% in some cases. Likewise for mustang, although the numbers are
slightly lower. Here's a sample from the falkor tests:
Function: memset
Variant: walk
simple_memset __memset_nozva __memset_zva_64 __memset_zva_default __memset_generic
Comments
Ping.
On Tuesday 19 September 2017 08:53 PM, Siddhesh Poyarekar wrote:
> The DZP bit in the dczid_el0 register does not change dynamically, so
> it is safe to read once during program startup. Hoist the zva check
> into an ifunc resolver and store the result into a static variable,
> which can be read in case of non-standard zva sizes. This effectively
> adds 3 ifunc variants for memset - one for cases where zva is
> disabled, one for 64 byte zva and another for 128 byte zva. I have
> retained the older memset as __memset_generic for internal libc.so use
> so that the change impact is minimal. We should eventually have a
> discussion on what is more expensive, reading dczid_el0 on every
> memset invocation or the indirection due to PLT.
>
> The gains due to this are significant for falkor, with gains as high
> as 80% in some cases. Likewise for mustang, although the numbers are
> slightly lower. Here's a sample from the falkor tests:
>
> Function: memset
> Variant: walk
> simple_memset __memset_nozva __memset_zva_64 __memset_zva_default __memset_generic
> ========================================================================================================================
> length=256, char=0: 1.82 (-87.26%) 26.99 ( 89.30%) 25.49 ( 78.76%) 23.48 ( 64.65%) 14.26
> length=257, char=0: 1.82 (-87.29%) 26.97 ( 88.44%) 25.77 ( 80.12%) 24.41 ( 70.57%) 14.31
> length=258, char=0: 1.82 (-87.38%) 26.27 ( 82.29%) 25.84 ( 79.28%) 24.33 ( 68.80%) 14.41
> length=259, char=0: 1.82 (-87.36%) 26.06 ( 81.15%) 25.72 ( 78.84%) 24.57 ( 70.80%) 14.38
> length=260, char=0: 1.82 (-87.44%) 25.35 ( 75.23%) 25.93 ( 79.23%) 24.34 ( 68.24%) 14.47
> length=261, char=0: 1.82 (-87.49%) 26.15 ( 79.70%) 26.01 ( 78.72%) 24.44 ( 67.97%) 14.55
> length=262, char=0: 1.82 (-87.54%) 25.91 ( 77.31%) 26.06 ( 78.35%) 24.33 ( 66.49%) 14.61
> length=263, char=0: 1.82 (-87.54%) 25.69 ( 75.80%) 25.96 ( 77.63%) 24.54 ( 67.90%) 14.61
> length=264, char=0: 1.82 (-87.57%) 25.31 ( 72.69%) 26.16 ( 78.43%) 24.63 ( 68.00%) 14.66
> length=265, char=0: 1.82 (-87.65%) 25.29 ( 71.35%) 26.25 ( 77.84%) 24.58 ( 66.53%) 14.76
> length=266, char=0: 1.82 (-87.69%) 25.10 ( 69.40%) 26.15 ( 76.48%) 24.77 ( 67.22%) 14.82
> length=267, char=0: 1.82 (-87.69%) 24.89 ( 68.02%) 26.20 ( 76.90%) 24.87 ( 67.91%) 14.81
> length=268, char=0: 1.82 (-87.74%) 24.07 ( 62.04%) 26.40 ( 77.74%) 24.95 ( 67.93%) 14.85
> length=269, char=0: 1.82 (-87.80%) 23.82 ( 59.29%) 26.47 ( 77.00%) 24.89 ( 66.43%) 14.96
> length=270, char=0: 1.82 (-87.84%) 23.65 ( 57.61%) 26.35 ( 75.58%) 25.07 ( 67.07%) 15.01
> length=271, char=0: 1.83 (-87.82%) 23.48 ( 56.53%) 26.39 ( 75.93%) 25.15 ( 67.66%) 15.00
> length=512, char=0: 1.90 (-92.59%) 29.25 ( 13.81%) 36.30 ( 41.27%) 40.95 ( 59.36%) 25.70
> length=513, char=0: 1.90 (-92.57%) 29.29 ( 14.35%) 36.63 ( 43.01%) 40.80 ( 59.28%) 25.61
> length=514, char=0: 1.90 (-92.62%) 28.61 ( 10.91%) 36.64 ( 42.05%) 40.89 ( 58.52%) 25.80
> length=515, char=0: 1.90 (-92.63%) 28.74 ( 11.29%) 36.68 ( 42.06%) 40.56 ( 57.08%) 25.82
> length=516, char=0: 1.90 (-92.65%) 28.33 ( 9.54%) 36.72 ( 41.96%) 40.09 ( 55.01%) 25.87
> length=517, char=0: 1.90 (-92.66%) 28.41 ( 9.60%) 36.80 ( 41.97%) 39.43 ( 52.13%) 25.92
> length=518, char=0: 1.90 (-92.66%) 28.16 ( 8.45%) 36.84 ( 41.89%) 39.40 ( 51.77%) 25.96
> length=519, char=0: 1.90 (-92.67%) 28.21 ( 8.58%) 36.86 ( 41.86%) 40.39 ( 55.46%) 25.98
> length=520, char=0: 1.90 (-92.65%) 27.53 ( 6.32%) 36.90 ( 42.49%) 40.80 ( 57.58%) 25.89
> length=521, char=0: 1.90 (-92.69%) 27.53 ( 5.65%) 36.61 ( 40.50%) 40.86 ( 56.81%) 26.05
> length=522, char=0: 1.90 (-92.66%) 27.40 ( 5.59%) 36.95 ( 42.35%) 40.92 ( 57.64%) 25.95
> length=523, char=0: 1.91 (-92.71%) 27.50 ( 5.29%) 36.69 ( 40.45%) 40.97 ( 56.84%) 26.12
> length=524, char=0: 1.90 (-92.69%) 27.33 ( 5.02%) 37.00 ( 42.18%) 40.98 ( 57.50%) 26.02
> length=525, char=0: 1.91 (-92.72%) 27.24 ( 4.04%) 36.70 ( 40.13%) 41.04 ( 56.72%) 26.19
> length=526, char=0: 1.90 (-92.70%) 27.06 ( 3.73%) 37.06 ( 42.05%) 41.08 ( 57.44%) 26.09
> length=527, char=0: 1.91 (-92.74%) 26.82 ( 2.17%) 37.06 ( 41.17%) 41.11 ( 56.62%) 26.25
> length=1024, char=0: 1.95 (-95.35%) 30.55 (-27.12%) 46.52 ( 10.99%) 49.89 ( 19.02%) 41.91
> length=1025, char=0: 1.95 (-95.31%) 30.58 (-26.47%) 46.57 ( 11.98%) 49.92 ( 20.05%) 41.59
> length=1026, char=0: 1.95 (-95.36%) 30.35 (-27.70%) 46.56 ( 10.92%) 49.45 ( 17.79%) 41.98
> length=1027, char=0: 1.95 (-95.36%) 30.24 (-28.02%) 46.20 ( 9.98%) 49.93 ( 18.88%) 42.00
> length=1028, char=0: 1.95 (-95.37%) 29.75 (-29.25%) 46.58 ( 10.76%) 49.92 ( 18.71%) 42.05
> length=1029, char=0: 1.95 (-95.37%) 29.78 (-29.24%) 46.57 ( 10.65%) 49.96 ( 18.72%) 42.08
> length=1030, char=0: 1.95 (-95.33%) 29.77 (-28.73%) 46.63 ( 11.63%) 49.97 ( 19.64%) 41.77
> length=1031, char=0: 1.95 (-95.37%) 29.64 (-29.68%) 46.62 ( 10.59%) 49.51 ( 17.46%) 42.15
> length=1032, char=0: 1.95 (-95.38%) 29.60 (-29.80%) 46.22 ( 9.63%) 49.99 ( 18.58%) 42.16
> length=1033, char=0: 1.95 (-95.38%) 29.32 (-30.55%) 46.65 ( 10.49%) 49.95 ( 18.32%) 42.22
> length=1034, char=0: 1.95 (-95.39%) 29.45 (-30.31%) 46.67 ( 10.44%) 50.01 ( 18.36%) 42.25
> length=1035, char=0: 1.95 (-95.35%) 29.31 (-30.09%) 46.68 ( 11.34%) 50.02 ( 19.31%) 41.92
> length=1036, char=0: 1.95 (-95.40%) 29.30 (-30.75%) 46.66 ( 10.27%) 49.56 ( 17.12%) 42.32
> length=1037, char=0: 1.95 (-95.39%) 29.17 (-31.08%) 46.30 ( 9.38%) 50.04 ( 18.22%) 42.33
> length=1038, char=0: 1.95 (-95.40%) 29.12 (-31.30%) 46.71 ( 10.19%) 50.02 ( 18.01%) 42.39
> length=1039, char=0: 1.95 (-95.40%) 29.19 (-31.20%) 46.73 ( 10.14%) 50.06 ( 18.00%) 42.43
>
> * sysdeps/aarch64/memset.S (do_no_zva): New macro.
> (do_zva_64): Likewise.
> (do_zva_128): Likewise.
> (do_zva_default): Likewise.
> (__memset): Rename to MEMSET macro.
> (MEMSET): Use the new macros.
> (MEMSET)[INTERNAL_MEMSET]: Retain old memset.
> (MEMSET)[!INTERNAL_MEMSET]: Remove zva check.
> * sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
> Add memset_generic, memset_nozva and memset_zva.
> * sysdeps/aarch64/multiarch/ifunc-impl-list.c
> (__libc_ifunc_impl_list): Add memset ifuncs.
> * sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New
> static variable __aarch64_zva_size and local variable
> zva_size.
> * sysdeps/aarch64/multiarch/memset.c: New file.
> * sysdeps/aarch64/multiarch/memset_generic.S: New file.
> * sysdeps/aarch64/multiarch/memset_nozva.S: New file.
> * sysdeps/aarch64/multiarch/memset_zva.S: New file.
> * sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> (DCZID_DZP_MASK): New macro.
> (DCZID_BS_MASK): Likewise.
> (init_cpu_features): Read and set zva_size.
> * sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> (struct cpu_features): New member zva_size.
> ---
> sysdeps/aarch64/memset.S | 248 +++++++++++++++----------
> sysdeps/aarch64/multiarch/Makefile | 2 +-
> sysdeps/aarch64/multiarch/ifunc-impl-list.c | 6 +
> sysdeps/aarch64/multiarch/init-arch.h | 9 +-
> sysdeps/aarch64/multiarch/memset.c | 47 +++++
> sysdeps/aarch64/multiarch/memset_generic.S | 27 +++
> sysdeps/aarch64/multiarch/memset_nozva.S | 22 +++
> sysdeps/aarch64/multiarch/memset_zva.S | 41 ++++
> sysdeps/unix/sysv/linux/aarch64/cpu-features.c | 10 +
> sysdeps/unix/sysv/linux/aarch64/cpu-features.h | 1 +
> 10 files changed, 313 insertions(+), 100 deletions(-)
> create mode 100644 sysdeps/aarch64/multiarch/memset.c
> create mode 100644 sysdeps/aarch64/multiarch/memset_generic.S
> create mode 100644 sysdeps/aarch64/multiarch/memset_nozva.S
> create mode 100644 sysdeps/aarch64/multiarch/memset_zva.S
>
> diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
> index 110fd22..8cff3a4 100644
> --- a/sysdeps/aarch64/memset.S
> +++ b/sysdeps/aarch64/memset.S
> @@ -37,7 +37,108 @@
> #define zva_len x7
> #define zva_lenw w7
>
> -ENTRY_ALIGN (__memset, 6)
> +/* Macros that do the critical loops for either no zva or zva of 64 bytes, 128
> + bytes and higher sizes. */
> +
> +#ifndef ZVA_MACROS
> +# define ZVA_MACROS
> +/* No ZVA. */
> +.macro do_no_zva
> + sub count, dstend, dst /* Count is 16 too large. */
> + add dst, dst, 16
> + sub count, count, 64 + 16 /* Adjust count and bias for loop. */
> +1: stp q0, q0, [dst], 64
> + stp q0, q0, [dst, -32]
> + subs count, count, 64
> + b.hi 1b
> + stp q0, q0, [dstend, -64]
> + stp q0, q0, [dstend, -32]
> + ret
> +.endm
> +
> +/* Write the first and last 64 byte aligned block using stp rather
> + than using DC ZVA. This is faster on some cores. */
> +.macro do_zva_64
> + str q0, [dst, 16]
> + stp q0, q0, [dst, 32]
> + bic dst, dst, 63
> + stp q0, q0, [dst, 64]
> + stp q0, q0, [dst, 96]
> + sub count, dstend, dst /* Count is now 128 too large. */
> + sub count, count, 128+64+64 /* Adjust count and bias for loop. */
> + add dst, dst, 128
> + nop
> +1: dc zva, dst
> + add dst, dst, 64
> + subs count, count, 64
> + b.hi 1b
> + stp q0, q0, [dst, 0]
> + stp q0, q0, [dst, 32]
> + stp q0, q0, [dstend, -64]
> + stp q0, q0, [dstend, -32]
> + ret
> +.endm
> +
> +/* ZVA size of 128 bytes. */
> +.macro do_zva_128
> + str q0, [dst, 16]
> + stp q0, q0, [dst, 32]
> + stp q0, q0, [dst, 64]
> + stp q0, q0, [dst, 96]
> + bic dst, dst, 127
> + sub count, dstend, dst /* Count is now 128 too large. */
> + sub count, count, 128+128 /* Adjust count and bias for loop. */
> + add dst, dst, 128
> +1: dc zva, dst
> + add dst, dst, 128
> + subs count, count, 128
> + b.hi 1b
> + stp q0, q0, [dstend, -128]
> + stp q0, q0, [dstend, -96]
> + stp q0, q0, [dstend, -64]
> + stp q0, q0, [dstend, -32]
> + ret
> +.endm
> +
> +/* ZVA size of more than 128 bytes. */
> +.macro do_zva_default
> + add tmp1, zva_len, 64 /* Max alignment bytes written. */
> + cmp count, tmp1
> + blo MEMSET_L(no_zva)
> +
> + sub tmp2, zva_len, 1
> + add tmp1, dst, zva_len
> + add dst, dst, 16
> + subs count, tmp1, dst /* Actual alignment bytes to write. */
> + bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
> + beq 2f
> +1: stp q0, q0, [dst], 64
> + stp q0, q0, [dst, -32]
> + subs count, count, 64
> + b.hi 1b
> +2: mov dst, tmp1
> + sub count, dstend, tmp1 /* Remaining bytes to write. */
> + subs count, count, zva_len
> + b.lo 4f
> +3: dc zva, dst
> + add dst, dst, zva_len
> + subs count, count, zva_len
> + b.hs 3b
> +4: add count, count, zva_len
> + subs count, count, 64
> + b.ls 6f
> +5: stp q0, q0, [dst], 64
> + stp q0, q0, [dst, -32]
> + subs count, count, 64
> + b.hi 5b
> +6: stp q0, q0, [dstend, -64]
> + stp q0, q0, [dstend, -32]
> + ret
> +.endm
> +#endif
> +
> +/* Memset entry point. */
> +ENTRY_ALIGN (MEMSET, 6)
>
> DELOUSE (0)
> DELOUSE (2)
> @@ -46,9 +147,9 @@ ENTRY_ALIGN (__memset, 6)
> add dstend, dstin, count
>
> cmp count, 96
> - b.hi L(set_long)
> + b.hi MEMSET_L(set_long)
> cmp count, 16
> - b.hs L(set_medium)
> + b.hs MEMSET_L(set_medium)
> mov val, v0.D[0]
>
> /* Set 0..15 bytes. */
> @@ -68,9 +169,9 @@ ENTRY_ALIGN (__memset, 6)
> 3: ret
>
> /* Set 17..96 bytes. */
> -L(set_medium):
> +MEMSET_L(set_medium):
> str q0, [dstin]
> - tbnz count, 6, L(set96)
> + tbnz count, 6, MEMSET_L(set96)
> str q0, [dstend, -16]
> tbz count, 5, 1f
> str q0, [dstin, 16]
> @@ -80,7 +181,7 @@ L(set_medium):
> .p2align 4
> /* Set 64..96 bytes. Write 64 bytes from the start and
> 32 bytes from the end. */
> -L(set96):
> +MEMSET_L(set96):
> str q0, [dstin, 16]
> stp q0, q0, [dstin, 32]
> stp q0, q0, [dstend, -32]
> @@ -88,108 +189,63 @@ L(set96):
>
> .p2align 3
> nop
> -L(set_long):
> +MEMSET_L(set_long):
> +#ifdef INTERNAL_MEMSET
> and valw, valw, 255
> bic dst, dstin, 15
> str q0, [dstin]
> cmp count, 256
> ccmp valw, 0, 0, cs
> - b.eq L(try_zva)
> -L(no_zva):
> - sub count, dstend, dst /* Count is 16 too large. */
> - add dst, dst, 16
> - sub count, count, 64 + 16 /* Adjust count and bias for loop. */
> -1: stp q0, q0, [dst], 64
> - stp q0, q0, [dst, -32]
> -L(tail64):
> - subs count, count, 64
> - b.hi 1b
> -2: stp q0, q0, [dstend, -64]
> - stp q0, q0, [dstend, -32]
> - ret
> + b.eq MEMSET_L(try_zva)
>
> - .p2align 3
> -L(try_zva):
> +MEMSET_L(no_zva):
> + do_no_zva
> +
> + .p2align 4
> +MEMSET_L(try_zva):
> mrs tmp1, dczid_el0
> - tbnz tmp1w, 4, L(no_zva)
> and tmp1w, tmp1w, 15
> cmp tmp1w, 4 /* ZVA size is 64 bytes. */
> - b.ne L(zva_128)
> + b.ne MEMSET_L(zva_128)
> + do_zva_64
>
> - /* Write the first and last 64 byte aligned block using stp rather
> - than using DC ZVA. This is faster on some cores.
> - */
> -L(zva_64):
> - str q0, [dst, 16]
> - stp q0, q0, [dst, 32]
> - bic dst, dst, 63
> - stp q0, q0, [dst, 64]
> - stp q0, q0, [dst, 96]
> - sub count, dstend, dst /* Count is now 128 too large. */
> - sub count, count, 128+64+64 /* Adjust count and bias for loop. */
> - add dst, dst, 128
> - nop
> -1: dc zva, dst
> - add dst, dst, 64
> - subs count, count, 64
> - b.hi 1b
> - stp q0, q0, [dst, 0]
> - stp q0, q0, [dst, 32]
> - stp q0, q0, [dstend, -64]
> - stp q0, q0, [dstend, -32]
> - ret
> -
> - .p2align 3
> -L(zva_128):
> +MEMSET_L(zva_128):
> cmp tmp1w, 5 /* ZVA size is 128 bytes. */
> - b.ne L(zva_other)
> + b.ne MEMSET_L(zva_other)
> + do_zva_128
>
> - str q0, [dst, 16]
> - stp q0, q0, [dst, 32]
> - stp q0, q0, [dst, 64]
> - stp q0, q0, [dst, 96]
> - bic dst, dst, 127
> - sub count, dstend, dst /* Count is now 128 too large. */
> - sub count, count, 128+128 /* Adjust count and bias for loop. */
> - add dst, dst, 128
> -1: dc zva, dst
> - add dst, dst, 128
> - subs count, count, 128
> - b.hi 1b
> - stp q0, q0, [dstend, -128]
> - stp q0, q0, [dstend, -96]
> - stp q0, q0, [dstend, -64]
> - stp q0, q0, [dstend, -32]
> - ret
> -
> -L(zva_other):
> +MEMSET_L(zva_other):
> mov tmp2w, 4
> lsl zva_lenw, tmp2w, tmp1w
> - add tmp1, zva_len, 64 /* Max alignment bytes written. */
> - cmp count, tmp1
> - blo L(no_zva)
> + do_zva_default
> +#else
> + /* Memset called through PLT, so we need only one of the ZVA
> + variants. */
> +# ifdef MEMSET_ZVA
> + and valw, valw, 255
> +# endif
> + bic dst, dstin, 15
> + str q0, [dstin]
> +# ifdef MEMSET_ZVA
> + cmp count, 256
> + ccmp valw, 0, 0, cs
> + b.eq MEMSET_L(try_zva)
> +# endif
> +MEMSET_L(no_zva):
> + do_no_zva
> +# if defined MEMSET_ZVA
> +MEMSET_L(try_zva):
> +# if MEMSET_ZVA == 64
> + do_zva_64
> +# elif MEMSET_ZVA == 128
> + do_zva_128
> +# else
> + adrp zva_len, __aarch64_zva_size
> + ldr zva_len, [zva_len, #:lo12:__aarch64_zva_size]
> + do_zva_default
> +# endif
> +# endif
> +#endif
>
> - sub tmp2, zva_len, 1
> - add tmp1, dst, zva_len
> - add dst, dst, 16
> - subs count, tmp1, dst /* Actual alignment bytes to write. */
> - bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
> - beq 2f
> -1: stp q0, q0, [dst], 64
> - stp q0, q0, [dst, -32]
> - subs count, count, 64
> - b.hi 1b
> -2: mov dst, tmp1
> - sub count, dstend, tmp1 /* Remaining bytes to write. */
> - subs count, count, zva_len
> - b.lo 4f
> -3: dc zva, dst
> - add dst, dst, zva_len
> - subs count, count, zva_len
> - b.hs 3b
> -4: add count, count, zva_len
> - b L(tail64)
> -
> -END (__memset)
> -weak_alias (__memset, memset)
> -libc_hidden_builtin_def (memset)
> +END (MEMSET)
> +libc_hidden_builtin_def (MEMSET)
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index 9aa1e79..f611182 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -1,4 +1,4 @@
> ifeq ($(subdir),string)
> sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
> - memmove_falkor
> + memmove_falkor memset_generic memset_nozva memset_zva
> endif
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index 2cb74d5..29148ac 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -46,6 +46,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
> + IFUNC_IMPL (i, name, memset,
> + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_nozva)
> + IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva_64)
> + IFUNC_IMPL_ADD (array, i, memset, (zva_size == 128), __memset_zva_128)
> + IFUNC_IMPL_ADD (array, i, memset, (zva_size > 0), __memset_zva_default)
> + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
>
> return i;
> }
> diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
> index 3af442c..541c27e 100644
> --- a/sysdeps/aarch64/multiarch/init-arch.h
> +++ b/sysdeps/aarch64/multiarch/init-arch.h
> @@ -18,6 +18,9 @@
>
> #include <ldsodefs.h>
>
> -#define INIT_ARCH() \
> - uint64_t __attribute__((unused)) midr = \
> - GLRO(dl_aarch64_cpu_features).midr_el1;
> +#define INIT_ARCH() \
> + uint64_t __attribute__((unused)) midr = \
> + GLRO(dl_aarch64_cpu_features).midr_el1; \
> + extern unsigned __aarch64_zva_size; \
> + unsigned __attribute__((unused)) zva_size = __aarch64_zva_size = \
> + GLRO(dl_aarch64_cpu_features).zva_size;
> diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
> new file mode 100644
> index 0000000..58e669a
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset.c
> @@ -0,0 +1,47 @@
> +/* Multiple versions of memset. AARCH64 version.
> + Copyright (C) 2017 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +/* Define multiple versions only for the definition in libc. */
> +
> +#if IS_IN (libc)
> +/* Redefine memset so that the compiler won't complain about the type
> + mismatch with the IFUNC selector in strong_alias, below. */
> +# undef memset
> +# define memset __redirect_memset
> +# include <string.h>
> +# include <init-arch.h>
> +
> +unsigned __aarch64_zva_size;
> +
> +extern __typeof (__redirect_memset) __libc_memset;
> +
> +extern __typeof (__redirect_memset) __memset_nozva attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_64 attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_128 attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_default attribute_hidden;
> +
> +libc_ifunc (__libc_memset, (zva_size == 0 ? __memset_nozva
> + : (zva_size == 64 ? __memset_zva_64
> + : (zva_size == 128 ? __memset_zva_128
> + : __memset_zva_default))));
> +
> +# undef memset
> +strong_alias (__libc_memset, memset);
> +#else
> +#include <string/memset.c>
> +#endif
> diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S
> new file mode 100644
> index 0000000..56f1e02
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_generic.S
> @@ -0,0 +1,27 @@
> +/* Memset for aarch64, default version for internal use.
> + Copyright (C) 2017 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#define MEMSET __memset_generic
> +#define INTERNAL_MEMSET
> +#define MEMSET_L(label) L(label)
> +#ifdef SHARED
> + .globl __GI_memset; __GI_memset = __memset_generic
> +#endif
> +
> +#include <sysdeps/aarch64/memset.S>
> diff --git a/sysdeps/aarch64/multiarch/memset_nozva.S b/sysdeps/aarch64/multiarch/memset_nozva.S
> new file mode 100644
> index 0000000..98045ac
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_nozva.S
> @@ -0,0 +1,22 @@
> +/* Memset for aarch64, ZVA disabled.
> + Copyright (C) 2017 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#define MEMSET __memset_nozva
> +#define MEMSET_L(label) L(label)
> +#include <sysdeps/aarch64/memset.S>
> diff --git a/sysdeps/aarch64/multiarch/memset_zva.S b/sysdeps/aarch64/multiarch/memset_zva.S
> new file mode 100644
> index 0000000..5d02b89
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_zva.S
> @@ -0,0 +1,41 @@
> +/* Memset for aarch64, ZVA enabled.
> + Copyright (C) 2017 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#if IS_IN (libc)
> +# define MEMSET __memset_zva_64
> +# define MEMSET_ZVA 64
> +# define MEMSET_L(label) L(label ## _zva64)
> +# include <sysdeps/aarch64/memset.S>
> +
> +# undef MEMSET
> +# undef MEMSET_ZVA
> +# undef MEMSET_L
> +# define MEMSET __memset_zva_128
> +# define MEMSET_ZVA 128
> +# define MEMSET_L(label) L(label ## _zva128)
> +# include <sysdeps/aarch64/memset.S>
> +
> +# undef MEMSET
> +# undef MEMSET_ZVA
> +# undef MEMSET_L
> +# define MEMSET __memset_zva_default
> +# define MEMSET_ZVA 1
> +# define MEMSET_L(label) L(label ## _zvadef)
> +# include <sysdeps/aarch64/memset.S>
> +#endif
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> index e769eeb..092ee81 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> @@ -20,6 +20,9 @@
> #include <sys/auxv.h>
> #include <elf/dl-hwcaps.h>
>
> +#define DCZID_DZP_MASK (1 << 4)
> +#define DCZID_BS_MASK (0xf)
> +
> #if HAVE_TUNABLES
> struct cpu_list
> {
> @@ -72,4 +75,11 @@ init_cpu_features (struct cpu_features *cpu_features)
> }
>
> cpu_features->midr_el1 = midr;
> +
> + /* Check if ZVA is enabled. */
> + unsigned dczid;
> + asm volatile ("mrs %0, dczid_el0" : "=r"(dczid));
> +
> + if ((dczid & DCZID_DZP_MASK) == 0)
> + cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
> }
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> index 73cb53d..f2b6afd 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> @@ -47,6 +47,7 @@
> struct cpu_features
> {
> uint64_t midr_el1;
> + unsigned zva_size;
> };
>
> #endif /* _CPU_FEATURES_AARCH64_H */
>
On 19/09/2017 08:23, Siddhesh Poyarekar wrote:
> The DZP bit in the dczid_el0 register does not change dynamically, so
> it is safe to read once during program startup. Hoist the zva check
> into an ifunc resolver and store the result into a static variable,
> which can be read in case of non-standard zva sizes. This effectively
> adds 3 ifunc variants for memset - one for cases where zva is
> disabled, one for 64 byte zva and another for 128 byte zva. I have
> retained the older memset as __memset_generic for internal libc.so use
> so that the change impact is minimal. We should eventually have a
> discussion on what is more expensive, reading dczid_el0 on every
> memset invocation or the indirection due to PLT.
>
> The gains due to this are significant for falkor, with gains as high
> as 80% in some cases. Likewise for mustang, although the numbers are
> slightly lower. Here's a sample from the falkor tests:
>
I would use a more compact ChangeLog entry as:
* sysdeps/aarch64/memset.S (do_no_zva, do_zva_64,
do_zva_128, do_zva_default): New macros.
Same for the other entries where it applies.
> (MEMSET): Use the new macros.
> (MEMSET)[INTERNAL_MEMSET]: Retain old memset.
> (MEMSET)[!INTERNAL_MEMSET]: Remove zva check.
> * sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
> Add memset_generic, memset_nozva and memset_zva.
> * sysdeps/aarch64/multiarch/ifunc-impl-list.c
> (__libc_ifunc_impl_list): Add memset ifuncs.
> * sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New
> static variable __aarch64_zva_size and local variable
> zva_size.
> * sysdeps/aarch64/multiarch/memset.c: New file.
> * sysdeps/aarch64/multiarch/memset_generic.S: New file.
> * sysdeps/aarch64/multiarch/memset_nozva.S: New file.
> * sysdeps/aarch64/multiarch/memset_zva.S: New file.
> * sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> (DCZID_DZP_MASK): New macro.
> (DCZID_BS_MASK): Likewise.
> (init_cpu_features): Read and set zva_size.
> * sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> (struct cpu_features): New member zva_size.
>
>
> diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
> index 110fd22..8cff3a4 100644
> --- a/sysdeps/aarch64/memset.S
> +++ b/sysdeps/aarch64/memset.S
> @@ -37,7 +37,108 @@
> #define zva_len x7
> #define zva_lenw w7
>
> -ENTRY_ALIGN (__memset, 6)
> +/* Macros that do the critical loops for either no zva or zva of 64 bytes, 128
> + bytes and higher sizes. */
> +
> +#ifndef ZVA_MACROS
> +# define ZVA_MACROS
> +/* No ZVA. */
> +.macro do_no_zva
> + sub count, dstend, dst /* Count is 16 too large. */
> + add dst, dst, 16
> + sub count, count, 64 + 16 /* Adjust count and bias for loop. */
> +1: stp q0, q0, [dst], 64
> + stp q0, q0, [dst, -32]
> + subs count, count, 64
> + b.hi 1b
> + stp q0, q0, [dstend, -64]
> + stp q0, q0, [dstend, -32]
> + ret
> +.endm
> +
> +/* Write the first and last 64 byte aligned block using stp rather
> + than using DC ZVA. This is faster on some cores. */
> +.macro do_zva_64
> + str q0, [dst, 16]
> + stp q0, q0, [dst, 32]
> + bic dst, dst, 63
> + stp q0, q0, [dst, 64]
> + stp q0, q0, [dst, 96]
> + sub count, dstend, dst /* Count is now 128 too large. */
> + sub count, count, 128+64+64 /* Adjust count and bias for loop. */
> + add dst, dst, 128
> + nop
> +1: dc zva, dst
> + add dst, dst, 64
> + subs count, count, 64
> + b.hi 1b
> + stp q0, q0, [dst, 0]
> + stp q0, q0, [dst, 32]
> + stp q0, q0, [dstend, -64]
> + stp q0, q0, [dstend, -32]
> + ret
> +.endm
> +
> +/* ZVA size of 128 bytes. */
> +.macro do_zva_128
> + str q0, [dst, 16]
> + stp q0, q0, [dst, 32]
> + stp q0, q0, [dst, 64]
> + stp q0, q0, [dst, 96]
> + bic dst, dst, 127
> + sub count, dstend, dst /* Count is now 128 too large. */
> + sub count, count, 128+128 /* Adjust count and bias for loop. */
> + add dst, dst, 128
> +1: dc zva, dst
> + add dst, dst, 128
> + subs count, count, 128
> + b.hi 1b
> + stp q0, q0, [dstend, -128]
> + stp q0, q0, [dstend, -96]
> + stp q0, q0, [dstend, -64]
> + stp q0, q0, [dstend, -32]
> + ret
> +.endm
> +
> +/* ZVA size of more than 128 bytes. */
> +.macro do_zva_default
> + add tmp1, zva_len, 64 /* Max alignment bytes written. */
> + cmp count, tmp1
> + blo MEMSET_L(no_zva)
> +
> + sub tmp2, zva_len, 1
> + add tmp1, dst, zva_len
> + add dst, dst, 16
> + subs count, tmp1, dst /* Actual alignment bytes to write. */
> + bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
> + beq 2f
> +1: stp q0, q0, [dst], 64
> + stp q0, q0, [dst, -32]
> + subs count, count, 64
> + b.hi 1b
> +2: mov dst, tmp1
> + sub count, dstend, tmp1 /* Remaining bytes to write. */
> + subs count, count, zva_len
> + b.lo 4f
> +3: dc zva, dst
> + add dst, dst, zva_len
> + subs count, count, zva_len
> + b.hs 3b
> +4: add count, count, zva_len
> + subs count, count, 64
> + b.ls 6f
> +5: stp q0, q0, [dst], 64
> + stp q0, q0, [dst, -32]
> + subs count, count, 64
> + b.hi 5b
> +6: stp q0, q0, [dstend, -64]
> + stp q0, q0, [dstend, -32]
> + ret
> +.endm
> +#endif
> +
> +/* Memset entry point. */
> +ENTRY_ALIGN (MEMSET, 6)
>
> DELOUSE (0)
> DELOUSE (2)
> @@ -46,9 +147,9 @@ ENTRY_ALIGN (__memset, 6)
> add dstend, dstin, count
>
> cmp count, 96
> - b.hi L(set_long)
> + b.hi MEMSET_L(set_long)
> cmp count, 16
> - b.hs L(set_medium)
> + b.hs MEMSET_L(set_medium)
> mov val, v0.D[0]
>
> /* Set 0..15 bytes. */
> @@ -68,9 +169,9 @@ ENTRY_ALIGN (__memset, 6)
> 3: ret
>
> /* Set 17..96 bytes. */
> -L(set_medium):
> +MEMSET_L(set_medium):
> str q0, [dstin]
> - tbnz count, 6, L(set96)
> + tbnz count, 6, MEMSET_L(set96)
> str q0, [dstend, -16]
> tbz count, 5, 1f
> str q0, [dstin, 16]
> @@ -80,7 +181,7 @@ L(set_medium):
> .p2align 4
> /* Set 64..96 bytes. Write 64 bytes from the start and
> 32 bytes from the end. */
> -L(set96):
> +MEMSET_L(set96):
> str q0, [dstin, 16]
> stp q0, q0, [dstin, 32]
> stp q0, q0, [dstend, -32]
> @@ -88,108 +189,63 @@ L(set96):
>
> .p2align 3
> nop
> -L(set_long):
> +MEMSET_L(set_long):
> +#ifdef INTERNAL_MEMSET
> and valw, valw, 255
> bic dst, dstin, 15
> str q0, [dstin]
> cmp count, 256
> ccmp valw, 0, 0, cs
> - b.eq L(try_zva)
> -L(no_zva):
> - sub count, dstend, dst /* Count is 16 too large. */
> - add dst, dst, 16
> - sub count, count, 64 + 16 /* Adjust count and bias for loop. */
> -1: stp q0, q0, [dst], 64
> - stp q0, q0, [dst, -32]
> -L(tail64):
> - subs count, count, 64
> - b.hi 1b
> -2: stp q0, q0, [dstend, -64]
> - stp q0, q0, [dstend, -32]
> - ret
> + b.eq MEMSET_L(try_zva)
>
> - .p2align 3
> -L(try_zva):
> +MEMSET_L(no_zva):
> + do_no_zva
> +
> + .p2align 4
> +MEMSET_L(try_zva):
> mrs tmp1, dczid_el0
> - tbnz tmp1w, 4, L(no_zva)
> and tmp1w, tmp1w, 15
> cmp tmp1w, 4 /* ZVA size is 64 bytes. */
> - b.ne L(zva_128)
> + b.ne MEMSET_L(zva_128)
> + do_zva_64
>
> - /* Write the first and last 64 byte aligned block using stp rather
> - than using DC ZVA. This is faster on some cores.
> - */
> -L(zva_64):
> - str q0, [dst, 16]
> - stp q0, q0, [dst, 32]
> - bic dst, dst, 63
> - stp q0, q0, [dst, 64]
> - stp q0, q0, [dst, 96]
> - sub count, dstend, dst /* Count is now 128 too large. */
> - sub count, count, 128+64+64 /* Adjust count and bias for loop. */
> - add dst, dst, 128
> - nop
> -1: dc zva, dst
> - add dst, dst, 64
> - subs count, count, 64
> - b.hi 1b
> - stp q0, q0, [dst, 0]
> - stp q0, q0, [dst, 32]
> - stp q0, q0, [dstend, -64]
> - stp q0, q0, [dstend, -32]
> - ret
> -
> - .p2align 3
> -L(zva_128):
> +MEMSET_L(zva_128):
> cmp tmp1w, 5 /* ZVA size is 128 bytes. */
> - b.ne L(zva_other)
> + b.ne MEMSET_L(zva_other)
> + do_zva_128
>
> - str q0, [dst, 16]
> - stp q0, q0, [dst, 32]
> - stp q0, q0, [dst, 64]
> - stp q0, q0, [dst, 96]
> - bic dst, dst, 127
> - sub count, dstend, dst /* Count is now 128 too large. */
> - sub count, count, 128+128 /* Adjust count and bias for loop. */
> - add dst, dst, 128
> -1: dc zva, dst
> - add dst, dst, 128
> - subs count, count, 128
> - b.hi 1b
> - stp q0, q0, [dstend, -128]
> - stp q0, q0, [dstend, -96]
> - stp q0, q0, [dstend, -64]
> - stp q0, q0, [dstend, -32]
> - ret
> -
> -L(zva_other):
> +MEMSET_L(zva_other):
> mov tmp2w, 4
> lsl zva_lenw, tmp2w, tmp1w
> - add tmp1, zva_len, 64 /* Max alignment bytes written. */
> - cmp count, tmp1
> - blo L(no_zva)
> + do_zva_default
> +#else
> + /* Memset called through PLT, so we need only one of the ZVA
> + variants. */
> +# ifdef MEMSET_ZVA
> + and valw, valw, 255
> +# endif
> + bic dst, dstin, 15
> + str q0, [dstin]
> +# ifdef MEMSET_ZVA
> + cmp count, 256
> + ccmp valw, 0, 0, cs
> + b.eq MEMSET_L(try_zva)
> +# endif
> +MEMSET_L(no_zva):
> + do_no_zva
> +# if defined MEMSET_ZVA
> +MEMSET_L(try_zva):
> +# if MEMSET_ZVA == 64
> + do_zva_64
> +# elif MEMSET_ZVA == 128
> + do_zva_128
> +# else
> + adrp zva_len, __aarch64_zva_size
> + ldr zva_len, [zva_len, #:lo12:__aarch64_zva_size]
> + do_zva_default
> +# endif
> +# endif
> +#endif
>
> - sub tmp2, zva_len, 1
> - add tmp1, dst, zva_len
> - add dst, dst, 16
> - subs count, tmp1, dst /* Actual alignment bytes to write. */
> - bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
> - beq 2f
> -1: stp q0, q0, [dst], 64
> - stp q0, q0, [dst, -32]
> - subs count, count, 64
> - b.hi 1b
> -2: mov dst, tmp1
> - sub count, dstend, tmp1 /* Remaining bytes to write. */
> - subs count, count, zva_len
> - b.lo 4f
> -3: dc zva, dst
> - add dst, dst, zva_len
> - subs count, count, zva_len
> - b.hs 3b
> -4: add count, count, zva_len
> - b L(tail64)
> -
> -END (__memset)
> -weak_alias (__memset, memset)
> -libc_hidden_builtin_def (memset)
> +END (MEMSET)
> +libc_hidden_builtin_def (MEMSET)
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index 9aa1e79..f611182 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -1,4 +1,4 @@
> ifeq ($(subdir),string)
> sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
> - memmove_falkor
> + memmove_falkor memset_generic memset_nozva memset_zva
> endif
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index 2cb74d5..29148ac 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -46,6 +46,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
> + IFUNC_IMPL (i, name, memset,
> + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_nozva)
> + IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva_64)
> + IFUNC_IMPL_ADD (array, i, memset, (zva_size == 128), __memset_zva_128)
> + IFUNC_IMPL_ADD (array, i, memset, (zva_size > 0), __memset_zva_default)
> + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
>
> return i;
> }
> diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
> index 3af442c..541c27e 100644
> --- a/sysdeps/aarch64/multiarch/init-arch.h
> +++ b/sysdeps/aarch64/multiarch/init-arch.h
> @@ -18,6 +18,9 @@
>
> #include <ldsodefs.h>
>
> -#define INIT_ARCH() \
> - uint64_t __attribute__((unused)) midr = \
> - GLRO(dl_aarch64_cpu_features).midr_el1;
> +#define INIT_ARCH() \
> + uint64_t __attribute__((unused)) midr = \
> + GLRO(dl_aarch64_cpu_features).midr_el1; \
> + extern unsigned __aarch64_zva_size; \
> + unsigned __attribute__((unused)) zva_size = __aarch64_zva_size = \
> + GLRO(dl_aarch64_cpu_features).zva_size;
> diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
> new file mode 100644
> index 0000000..58e669a
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset.c
> @@ -0,0 +1,47 @@
> +/* Multiple versions of memset. AARCH64 version.
> + Copyright (C) 2017 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +/* Define multiple versions only for the definition in libc. */
> +
> +#if IS_IN (libc)
> +/* Redefine memset so that the compiler won't complain about the type
> + mismatch with the IFUNC selector in strong_alias, below. */
> +# undef memset
> +# define memset __redirect_memset
> +# include <string.h>
> +# include <init-arch.h>
> +
> +unsigned __aarch64_zva_size;
> +
> +extern __typeof (__redirect_memset) __libc_memset;
> +
> +extern __typeof (__redirect_memset) __memset_nozva attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_64 attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_128 attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_default attribute_hidden;
> +
> +libc_ifunc (__libc_memset, (zva_size == 0 ? __memset_nozva
> + : (zva_size == 64 ? __memset_zva_64
> + : (zva_size == 128 ? __memset_zva_128
> + : __memset_zva_default))));
> +
> +# undef memset
> +strong_alias (__libc_memset, memset);
> +#else
> +#include <string/memset.c>
> +#endif
You don't need to use the default version for the loader; you can use the
generic sysdeps/aarch64/memset.S by creating an rtld-memset.S in
multiarch and defining the required macros.
> diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S
> new file mode 100644
> index 0000000..56f1e02
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_generic.S
> @@ -0,0 +1,27 @@
> +/* Memset for aarch64, default version for internal use.
> + Copyright (C) 2017 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#define MEMSET __memset_generic
> +#define INTERNAL_MEMSET
> +#define MEMSET_L(label) L(label)
> +#ifdef SHARED
> + .globl __GI_memset; __GI_memset = __memset_generic
> +#endif
I would add a comment stating it is essentially doing libc_hidden_def(memset)
and redirecting the internal implementation to __memset_generic.
> +
> +#include <sysdeps/aarch64/memset.S>
> diff --git a/sysdeps/aarch64/multiarch/memset_nozva.S b/sysdeps/aarch64/multiarch/memset_nozva.S
> new file mode 100644
> index 0000000..98045ac
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_nozva.S
> @@ -0,0 +1,22 @@
> +/* Memset for aarch64, ZVA disabled.
> + Copyright (C) 2017 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#define MEMSET __memset_nozva
> +#define MEMSET_L(label) L(label)
> +#include <sysdeps/aarch64/memset.S>
Although not strictly required, I think it should avoid building these
for !IS_IN(libc), as is done for memset_zva.S. The same applies to memset_generic.S.
> diff --git a/sysdeps/aarch64/multiarch/memset_zva.S b/sysdeps/aarch64/multiarch/memset_zva.S
> new file mode 100644
> index 0000000..5d02b89
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_zva.S
> @@ -0,0 +1,41 @@
> +/* Memset for aarch64, ZVA enabled.
> + Copyright (C) 2017 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#if IS_IN (libc)
> +# define MEMSET __memset_zva_64
> +# define MEMSET_ZVA 64
> +# define MEMSET_L(label) L(label ## _zva64)
> +# include <sysdeps/aarch64/memset.S>
> +
> +# undef MEMSET
> +# undef MEMSET_ZVA
> +# undef MEMSET_L
> +# define MEMSET __memset_zva_128
> +# define MEMSET_ZVA 128
> +# define MEMSET_L(label) L(label ## _zva128)
> +# include <sysdeps/aarch64/memset.S>
> +
> +# undef MEMSET
> +# undef MEMSET_ZVA
> +# undef MEMSET_L
> +# define MEMSET __memset_zva_default
> +# define MEMSET_ZVA 1
> +# define MEMSET_L(label) L(label ## _zvadef)
> +# include <sysdeps/aarch64/memset.S>
> +#endif
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> index e769eeb..092ee81 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> @@ -20,6 +20,9 @@
> #include <sys/auxv.h>
> #include <elf/dl-hwcaps.h>
>
> +#define DCZID_DZP_MASK (1 << 4)
> +#define DCZID_BS_MASK (0xf)
> +
> #if HAVE_TUNABLES
> struct cpu_list
> {
> @@ -72,4 +75,11 @@ init_cpu_features (struct cpu_features *cpu_features)
> }
>
> cpu_features->midr_el1 = midr;
> +
> + /* Check if ZVA is enabled. */
> + unsigned dczid;
> + asm volatile ("mrs %0, dczid_el0" : "=r"(dczid));
> +
> + if ((dczid & DCZID_DZP_MASK) == 0)
> + cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
> }
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> index 73cb53d..f2b6afd 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> @@ -47,6 +47,7 @@
> struct cpu_features
> {
> uint64_t midr_el1;
> + unsigned zva_size;
> };
>
> #endif /* _CPU_FEATURES_AARCH64_H */
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
On Friday 29 September 2017 12:42 AM, Adhemerval Zanella wrote:
> I would use a more compact ChangeLog entry as:
>
> * sysdeps/aarch64/memset.S (do_no_zva, do_zva_64,
> do_zva_128, do_zva_default): New macro.
>
> Same for the other entries where it applies.
Done.
> You don't need use the default version for the loader, you can use the
> generic sysdeps/aarch64/memset.S by creating a rtld-memset.S on
> multiarch and defining the required macros.
Done.
> I would add a comment stating it is essentially doing libc_hidden_def(memset)
> and redirecting the internal implementation to __memset_generic.
Done.
> Although not strictly required, I think it should avoid build these
> for !IS_IN(libc) as for memset_zva.S. Same applied for memset_generic.S.
Done.
Posting updated patch shortly.
Thanks,
Siddhesh
========================================================================================================================
length=256, char=0: 1.82 (-87.26%) 26.99 ( 89.30%) 25.49 ( 78.76%) 23.48 ( 64.65%) 14.26
length=257, char=0: 1.82 (-87.29%) 26.97 ( 88.44%) 25.77 ( 80.12%) 24.41 ( 70.57%) 14.31
length=258, char=0: 1.82 (-87.38%) 26.27 ( 82.29%) 25.84 ( 79.28%) 24.33 ( 68.80%) 14.41
length=259, char=0: 1.82 (-87.36%) 26.06 ( 81.15%) 25.72 ( 78.84%) 24.57 ( 70.80%) 14.38
length=260, char=0: 1.82 (-87.44%) 25.35 ( 75.23%) 25.93 ( 79.23%) 24.34 ( 68.24%) 14.47
length=261, char=0: 1.82 (-87.49%) 26.15 ( 79.70%) 26.01 ( 78.72%) 24.44 ( 67.97%) 14.55
length=262, char=0: 1.82 (-87.54%) 25.91 ( 77.31%) 26.06 ( 78.35%) 24.33 ( 66.49%) 14.61
length=263, char=0: 1.82 (-87.54%) 25.69 ( 75.80%) 25.96 ( 77.63%) 24.54 ( 67.90%) 14.61
length=264, char=0: 1.82 (-87.57%) 25.31 ( 72.69%) 26.16 ( 78.43%) 24.63 ( 68.00%) 14.66
length=265, char=0: 1.82 (-87.65%) 25.29 ( 71.35%) 26.25 ( 77.84%) 24.58 ( 66.53%) 14.76
length=266, char=0: 1.82 (-87.69%) 25.10 ( 69.40%) 26.15 ( 76.48%) 24.77 ( 67.22%) 14.82
length=267, char=0: 1.82 (-87.69%) 24.89 ( 68.02%) 26.20 ( 76.90%) 24.87 ( 67.91%) 14.81
length=268, char=0: 1.82 (-87.74%) 24.07 ( 62.04%) 26.40 ( 77.74%) 24.95 ( 67.93%) 14.85
length=269, char=0: 1.82 (-87.80%) 23.82 ( 59.29%) 26.47 ( 77.00%) 24.89 ( 66.43%) 14.96
length=270, char=0: 1.82 (-87.84%) 23.65 ( 57.61%) 26.35 ( 75.58%) 25.07 ( 67.07%) 15.01
length=271, char=0: 1.83 (-87.82%) 23.48 ( 56.53%) 26.39 ( 75.93%) 25.15 ( 67.66%) 15.00
length=512, char=0: 1.90 (-92.59%) 29.25 ( 13.81%) 36.30 ( 41.27%) 40.95 ( 59.36%) 25.70
length=513, char=0: 1.90 (-92.57%) 29.29 ( 14.35%) 36.63 ( 43.01%) 40.80 ( 59.28%) 25.61
length=514, char=0: 1.90 (-92.62%) 28.61 ( 10.91%) 36.64 ( 42.05%) 40.89 ( 58.52%) 25.80
length=515, char=0: 1.90 (-92.63%) 28.74 ( 11.29%) 36.68 ( 42.06%) 40.56 ( 57.08%) 25.82
length=516, char=0: 1.90 (-92.65%) 28.33 ( 9.54%) 36.72 ( 41.96%) 40.09 ( 55.01%) 25.87
length=517, char=0: 1.90 (-92.66%) 28.41 ( 9.60%) 36.80 ( 41.97%) 39.43 ( 52.13%) 25.92
length=518, char=0: 1.90 (-92.66%) 28.16 ( 8.45%) 36.84 ( 41.89%) 39.40 ( 51.77%) 25.96
length=519, char=0: 1.90 (-92.67%) 28.21 ( 8.58%) 36.86 ( 41.86%) 40.39 ( 55.46%) 25.98
length=520, char=0: 1.90 (-92.65%) 27.53 ( 6.32%) 36.90 ( 42.49%) 40.80 ( 57.58%) 25.89
length=521, char=0: 1.90 (-92.69%) 27.53 ( 5.65%) 36.61 ( 40.50%) 40.86 ( 56.81%) 26.05
length=522, char=0: 1.90 (-92.66%) 27.40 ( 5.59%) 36.95 ( 42.35%) 40.92 ( 57.64%) 25.95
length=523, char=0: 1.91 (-92.71%) 27.50 ( 5.29%) 36.69 ( 40.45%) 40.97 ( 56.84%) 26.12
length=524, char=0: 1.90 (-92.69%) 27.33 ( 5.02%) 37.00 ( 42.18%) 40.98 ( 57.50%) 26.02
length=525, char=0: 1.91 (-92.72%) 27.24 ( 4.04%) 36.70 ( 40.13%) 41.04 ( 56.72%) 26.19
length=526, char=0: 1.90 (-92.70%) 27.06 ( 3.73%) 37.06 ( 42.05%) 41.08 ( 57.44%) 26.09
length=527, char=0: 1.91 (-92.74%) 26.82 ( 2.17%) 37.06 ( 41.17%) 41.11 ( 56.62%) 26.25
length=1024, char=0: 1.95 (-95.35%) 30.55 (-27.12%) 46.52 ( 10.99%) 49.89 ( 19.02%) 41.91
length=1025, char=0: 1.95 (-95.31%) 30.58 (-26.47%) 46.57 ( 11.98%) 49.92 ( 20.05%) 41.59
length=1026, char=0: 1.95 (-95.36%) 30.35 (-27.70%) 46.56 ( 10.92%) 49.45 ( 17.79%) 41.98
length=1027, char=0: 1.95 (-95.36%) 30.24 (-28.02%) 46.20 ( 9.98%) 49.93 ( 18.88%) 42.00
length=1028, char=0: 1.95 (-95.37%) 29.75 (-29.25%) 46.58 ( 10.76%) 49.92 ( 18.71%) 42.05
length=1029, char=0: 1.95 (-95.37%) 29.78 (-29.24%) 46.57 ( 10.65%) 49.96 ( 18.72%) 42.08
length=1030, char=0: 1.95 (-95.33%) 29.77 (-28.73%) 46.63 ( 11.63%) 49.97 ( 19.64%) 41.77
length=1031, char=0: 1.95 (-95.37%) 29.64 (-29.68%) 46.62 ( 10.59%) 49.51 ( 17.46%) 42.15
length=1032, char=0: 1.95 (-95.38%) 29.60 (-29.80%) 46.22 ( 9.63%) 49.99 ( 18.58%) 42.16
length=1033, char=0: 1.95 (-95.38%) 29.32 (-30.55%) 46.65 ( 10.49%) 49.95 ( 18.32%) 42.22
length=1034, char=0: 1.95 (-95.39%) 29.45 (-30.31%) 46.67 ( 10.44%) 50.01 ( 18.36%) 42.25
length=1035, char=0: 1.95 (-95.35%) 29.31 (-30.09%) 46.68 ( 11.34%) 50.02 ( 19.31%) 41.92
length=1036, char=0: 1.95 (-95.40%) 29.30 (-30.75%) 46.66 ( 10.27%) 49.56 ( 17.12%) 42.32
length=1037, char=0: 1.95 (-95.39%) 29.17 (-31.08%) 46.30 ( 9.38%) 50.04 ( 18.22%) 42.33
length=1038, char=0: 1.95 (-95.40%) 29.12 (-31.30%) 46.71 ( 10.19%) 50.02 ( 18.01%) 42.39
length=1039, char=0: 1.95 (-95.40%) 29.19 (-31.20%) 46.73 ( 10.14%) 50.06 ( 18.00%) 42.43
* sysdeps/aarch64/memset.S (do_no_zva): New macro.
(do_zva_64): Likewise.
(do_zva_128): Likewise.
(do_zva_default): Likewise.
(__memset): Rename to MEMSET macro.
(MEMSET): Use the new macros.
(MEMSET)[INTERNAL_MEMSET]: Retain old memset.
(MEMSET)[!INTERNAL_MEMSET]: Remove zva check.
* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
Add memset_generic, memset_nozva and memset_zva.
* sysdeps/aarch64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add memset ifuncs.
* sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New
static variable __aarch64_zva_size and local variable
zva_size.
* sysdeps/aarch64/multiarch/memset.c: New file.
* sysdeps/aarch64/multiarch/memset_generic.S: New file.
* sysdeps/aarch64/multiarch/memset_nozva.S: New file.
* sysdeps/aarch64/multiarch/memset_zva.S: New file.
* sysdeps/unix/sysv/linux/aarch64/cpu-features.c
(DCZID_DZP_MASK): New macro.
(DCZID_BS_MASK): Likewise.
(init_cpu_features): Read and set zva_size.
* sysdeps/unix/sysv/linux/aarch64/cpu-features.h
(struct cpu_features): New member zva_size.
---
sysdeps/aarch64/memset.S | 248 +++++++++++++++----------
sysdeps/aarch64/multiarch/Makefile | 2 +-
sysdeps/aarch64/multiarch/ifunc-impl-list.c | 6 +
sysdeps/aarch64/multiarch/init-arch.h | 9 +-
sysdeps/aarch64/multiarch/memset.c | 47 +++++
sysdeps/aarch64/multiarch/memset_generic.S | 27 +++
sysdeps/aarch64/multiarch/memset_nozva.S | 22 +++
sysdeps/aarch64/multiarch/memset_zva.S | 41 ++++
sysdeps/unix/sysv/linux/aarch64/cpu-features.c | 10 +
sysdeps/unix/sysv/linux/aarch64/cpu-features.h | 1 +
10 files changed, 313 insertions(+), 100 deletions(-)
create mode 100644 sysdeps/aarch64/multiarch/memset.c
create mode 100644 sysdeps/aarch64/multiarch/memset_generic.S
create mode 100644 sysdeps/aarch64/multiarch/memset_nozva.S
create mode 100644 sysdeps/aarch64/multiarch/memset_zva.S
@@ -37,7 +37,108 @@
#define zva_len x7
#define zva_lenw w7
-ENTRY_ALIGN (__memset, 6)
+/* Macros that do the critical loops for either no zva or zva of 64 bytes, 128
+ bytes and higher sizes. */
+
+#ifndef ZVA_MACROS
+# define ZVA_MACROS
+/* No ZVA. */
+.macro do_no_zva
+ sub count, dstend, dst /* Count is 16 too large. */
+ add dst, dst, 16
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+1: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+ subs count, count, 64
+ b.hi 1b
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+.endm
+
+/* Write the first and last 64 byte aligned block using stp rather
+ than using DC ZVA. This is faster on some cores. */
+.macro do_zva_64
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ bic dst, dst, 63
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ sub count, dstend, dst /* Count is now 128 too large. */
+ sub count, count, 128+64+64 /* Adjust count and bias for loop. */
+ add dst, dst, 128
+ nop
+1: dc zva, dst
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi 1b
+ stp q0, q0, [dst, 0]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+.endm
+
+/* ZVA size of 128 bytes. */
+.macro do_zva_128
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ bic dst, dst, 127
+ sub count, dstend, dst /* Count is now 128 too large. */
+ sub count, count, 128+128 /* Adjust count and bias for loop. */
+ add dst, dst, 128
+1: dc zva, dst
+ add dst, dst, 128
+ subs count, count, 128
+ b.hi 1b
+ stp q0, q0, [dstend, -128]
+ stp q0, q0, [dstend, -96]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+.endm
+
+/* ZVA size of more than 128 bytes. */
+.macro do_zva_default
+ add tmp1, zva_len, 64 /* Max alignment bytes written. */
+ cmp count, tmp1
+ blo MEMSET_L(no_zva)
+
+ sub tmp2, zva_len, 1
+ add tmp1, dst, zva_len
+ add dst, dst, 16
+ subs count, tmp1, dst /* Actual alignment bytes to write. */
+ bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
+ beq 2f
+1: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+ subs count, count, 64
+ b.hi 1b
+2: mov dst, tmp1
+ sub count, dstend, tmp1 /* Remaining bytes to write. */
+ subs count, count, zva_len
+ b.lo 4f
+3: dc zva, dst
+ add dst, dst, zva_len
+ subs count, count, zva_len
+ b.hs 3b
+4: add count, count, zva_len
+ subs count, count, 64
+ b.ls 6f
+5: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+ subs count, count, 64
+ b.hi 5b
+6: stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+.endm
+#endif
+
+/* Memset entry point. */
+ENTRY_ALIGN (MEMSET, 6)
DELOUSE (0)
DELOUSE (2)
@@ -46,9 +147,9 @@ ENTRY_ALIGN (__memset, 6)
add dstend, dstin, count
cmp count, 96
- b.hi L(set_long)
+ b.hi MEMSET_L(set_long)
cmp count, 16
- b.hs L(set_medium)
+ b.hs MEMSET_L(set_medium)
mov val, v0.D[0]
/* Set 0..15 bytes. */
@@ -68,9 +169,9 @@ ENTRY_ALIGN (__memset, 6)
3: ret
/* Set 17..96 bytes. */
-L(set_medium):
+MEMSET_L(set_medium):
str q0, [dstin]
- tbnz count, 6, L(set96)
+ tbnz count, 6, MEMSET_L(set96)
str q0, [dstend, -16]
tbz count, 5, 1f
str q0, [dstin, 16]
@@ -80,7 +181,7 @@ L(set_medium):
.p2align 4
/* Set 64..96 bytes. Write 64 bytes from the start and
32 bytes from the end. */
-L(set96):
+MEMSET_L(set96):
str q0, [dstin, 16]
stp q0, q0, [dstin, 32]
stp q0, q0, [dstend, -32]
@@ -88,108 +189,63 @@ L(set96):
.p2align 3
nop
-L(set_long):
+MEMSET_L(set_long):
+#ifdef INTERNAL_MEMSET
and valw, valw, 255
bic dst, dstin, 15
str q0, [dstin]
cmp count, 256
ccmp valw, 0, 0, cs
- b.eq L(try_zva)
-L(no_zva):
- sub count, dstend, dst /* Count is 16 too large. */
- add dst, dst, 16
- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-1: stp q0, q0, [dst], 64
- stp q0, q0, [dst, -32]
-L(tail64):
- subs count, count, 64
- b.hi 1b
-2: stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
- ret
+ b.eq MEMSET_L(try_zva)
- .p2align 3
-L(try_zva):
+MEMSET_L(no_zva):
+ do_no_zva
+
+ .p2align 4
+MEMSET_L(try_zva):
mrs tmp1, dczid_el0
- tbnz tmp1w, 4, L(no_zva)
and tmp1w, tmp1w, 15
cmp tmp1w, 4 /* ZVA size is 64 bytes. */
- b.ne L(zva_128)
+ b.ne MEMSET_L(zva_128)
+ do_zva_64
- /* Write the first and last 64 byte aligned block using stp rather
- than using DC ZVA. This is faster on some cores.
- */
-L(zva_64):
- str q0, [dst, 16]
- stp q0, q0, [dst, 32]
- bic dst, dst, 63
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+64+64 /* Adjust count and bias for loop. */
- add dst, dst, 128
- nop
-1: dc zva, dst
- add dst, dst, 64
- subs count, count, 64
- b.hi 1b
- stp q0, q0, [dst, 0]
- stp q0, q0, [dst, 32]
- stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
- ret
-
- .p2align 3
-L(zva_128):
+MEMSET_L(zva_128):
cmp tmp1w, 5 /* ZVA size is 128 bytes. */
- b.ne L(zva_other)
+ b.ne MEMSET_L(zva_other)
+ do_zva_128
- str q0, [dst, 16]
- stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- bic dst, dst, 127
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+128 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
- add dst, dst, 128
- subs count, count, 128
- b.hi 1b
- stp q0, q0, [dstend, -128]
- stp q0, q0, [dstend, -96]
- stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
- ret
-
-L(zva_other):
+MEMSET_L(zva_other):
mov tmp2w, 4
lsl zva_lenw, tmp2w, tmp1w
- add tmp1, zva_len, 64 /* Max alignment bytes written. */
- cmp count, tmp1
- blo L(no_zva)
+ do_zva_default
+#else
+ /* Memset called through PLT, so we need only one of the ZVA
+ variants. */
+# ifdef MEMSET_ZVA
+ and valw, valw, 255
+# endif
+ bic dst, dstin, 15
+ str q0, [dstin]
+# ifdef MEMSET_ZVA
+ cmp count, 256
+ ccmp valw, 0, 0, cs
+ b.eq MEMSET_L(try_zva)
+# endif
+MEMSET_L(no_zva):
+ do_no_zva
+# if defined MEMSET_ZVA
+MEMSET_L(try_zva):
+# if MEMSET_ZVA == 64
+ do_zva_64
+# elif MEMSET_ZVA == 128
+ do_zva_128
+# else
+ adrp zva_len, __aarch64_zva_size
+ ldr zva_len, [zva_len, #:lo12:__aarch64_zva_size]
+ do_zva_default
+# endif
+# endif
+#endif
- sub tmp2, zva_len, 1
- add tmp1, dst, zva_len
- add dst, dst, 16
- subs count, tmp1, dst /* Actual alignment bytes to write. */
- bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
- beq 2f
-1: stp q0, q0, [dst], 64
- stp q0, q0, [dst, -32]
- subs count, count, 64
- b.hi 1b
-2: mov dst, tmp1
- sub count, dstend, tmp1 /* Remaining bytes to write. */
- subs count, count, zva_len
- b.lo 4f
-3: dc zva, dst
- add dst, dst, zva_len
- subs count, count, zva_len
- b.hs 3b
-4: add count, count, zva_len
- b L(tail64)
-
-END (__memset)
-weak_alias (__memset, memset)
-libc_hidden_builtin_def (memset)
+END (MEMSET)
+libc_hidden_builtin_def (MEMSET)
@@ -1,4 +1,4 @@
ifeq ($(subdir),string)
sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
- memmove_falkor
+ memmove_falkor memset_generic memset_nozva memset_zva
endif
@@ -46,6 +46,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
+ IFUNC_IMPL (i, name, memset,
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_nozva)
+ IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva_64)
+ IFUNC_IMPL_ADD (array, i, memset, (zva_size == 128), __memset_zva_128)
+ IFUNC_IMPL_ADD (array, i, memset, (zva_size > 0), __memset_zva_default)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
return i;
}
@@ -18,6 +18,9 @@
#include <ldsodefs.h>
-#define INIT_ARCH() \
- uint64_t __attribute__((unused)) midr = \
- GLRO(dl_aarch64_cpu_features).midr_el1;
+#define INIT_ARCH() \
+ uint64_t __attribute__((unused)) midr = \
+ GLRO(dl_aarch64_cpu_features).midr_el1; \
+ extern unsigned __aarch64_zva_size; \
+ unsigned __attribute__((unused)) zva_size = __aarch64_zva_size = \
+ GLRO(dl_aarch64_cpu_features).zva_size;
new file mode 100644
@@ -0,0 +1,47 @@
+/* Multiple versions of memset. AARCH64 version.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+
+#if IS_IN (libc)
+/* Redefine memset so that the compiler won't complain about the type
+ mismatch with the IFUNC selector in strong_alias, below. */
+# undef memset
+# define memset __redirect_memset
+# include <string.h>
+# include <init-arch.h>
+
+unsigned __aarch64_zva_size;
+
+extern __typeof (__redirect_memset) __libc_memset;
+
+extern __typeof (__redirect_memset) __memset_nozva attribute_hidden;
+extern __typeof (__redirect_memset) __memset_zva_64 attribute_hidden;
+extern __typeof (__redirect_memset) __memset_zva_128 attribute_hidden;
+extern __typeof (__redirect_memset) __memset_zva_default attribute_hidden;
+
+libc_ifunc (__libc_memset, (zva_size == 0 ? __memset_nozva
+ : (zva_size == 64 ? __memset_zva_64
+ : (zva_size == 128 ? __memset_zva_128
+ : __memset_zva_default))));
+
+# undef memset
+strong_alias (__libc_memset, memset);
+#else
+#include <string/memset.c>
+#endif
new file mode 100644
@@ -0,0 +1,27 @@
+/* Memset for aarch64, default version for internal use.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define MEMSET __memset_generic
+#define INTERNAL_MEMSET
+#define MEMSET_L(label) L(label)
+#ifdef SHARED
+ .globl __GI_memset; __GI_memset = __memset_generic
+#endif
+
+#include <sysdeps/aarch64/memset.S>
new file mode 100644
@@ -0,0 +1,22 @@
+/* Memset for aarch64, ZVA disabled.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define MEMSET __memset_nozva
+#define MEMSET_L(label) L(label)
+#include <sysdeps/aarch64/memset.S>
new file mode 100644
@@ -0,0 +1,41 @@
+/* Memset for aarch64, ZVA enabled.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define MEMSET __memset_zva_64
+# define MEMSET_ZVA 64
+# define MEMSET_L(label) L(label ## _zva64)
+# include <sysdeps/aarch64/memset.S>
+
+# undef MEMSET
+# undef MEMSET_ZVA
+# undef MEMSET_L
+# define MEMSET __memset_zva_128
+# define MEMSET_ZVA 128
+# define MEMSET_L(label) L(label ## _zva128)
+# include <sysdeps/aarch64/memset.S>
+
+# undef MEMSET
+# undef MEMSET_ZVA
+# undef MEMSET_L
+# define MEMSET __memset_zva_default
+# define MEMSET_ZVA 1
+# define MEMSET_L(label) L(label ## _zvadef)
+# include <sysdeps/aarch64/memset.S>
+#endif
@@ -20,6 +20,9 @@
#include <sys/auxv.h>
#include <elf/dl-hwcaps.h>
+#define DCZID_DZP_MASK (1 << 4)
+#define DCZID_BS_MASK (0xf)
+
#if HAVE_TUNABLES
struct cpu_list
{
@@ -72,4 +75,11 @@ init_cpu_features (struct cpu_features *cpu_features)
}
cpu_features->midr_el1 = midr;
+
+ /* Check if ZVA is enabled. */
+ unsigned dczid;
+ asm volatile ("mrs %0, dczid_el0" : "=r"(dczid));
+
+ if ((dczid & DCZID_DZP_MASK) == 0)
+ cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
}
@@ -47,6 +47,7 @@
struct cpu_features
{
uint64_t midr_el1;
+ unsigned zva_size;
};
#endif /* _CPU_FEATURES_AARCH64_H */