[2/3] AArch64: Add memset_zva64
Checks
Commit Message
Add a specialized memset for the common ZVA size of 64. Since the code is
identical to __memset_falkor, remove the latter.
OK for commit?
---
Comments
On 10/11/23 14:35, Wilco Dijkstra wrote:
>
> Add a specialized memset for the common ZVA size of 64. Since the code is
> identical to __memset_falkor, remove the latter.
>
> OK for commit?
LGTM, thanks.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
>
> ---
>
> diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
> index bf3cf85c8a95fd8c03ae13c4173fe507040ee8cd..bbfb7184c3e4277f59178ccf4f9b92814dd7a48d 100644
> --- a/sysdeps/aarch64/memset.S
> +++ b/sysdeps/aarch64/memset.S
> @@ -101,19 +101,19 @@ L(tail64):
> ret
>
> L(try_zva):
> -#ifdef ZVA_MACRO
> - zva_macro
> -#else
> +#ifndef ZVA64_ONLY
> .p2align 3
> mrs tmp1, dczid_el0
> tbnz tmp1w, 4, L(no_zva)
> and tmp1w, tmp1w, 15
> cmp tmp1w, 4 /* ZVA size is 64 bytes. */
> b.ne L(zva_128)
> -
> + nop
> +#endif
> /* Write the first and last 64 byte aligned block using stp rather
> than using DC ZVA. This is faster on some cores.
> */
> + .p2align 4
> L(zva_64):
> str q0, [dst, 16]
> stp q0, q0, [dst, 32]
> @@ -123,7 +123,6 @@ L(zva_64):
> sub count, dstend, dst /* Count is now 128 too large. */
> sub count, count, 128+64+64 /* Adjust count and bias for loop. */
> add dst, dst, 128
> - nop
> 1: dc zva, dst
> add dst, dst, 64
> subs count, count, 64
> @@ -134,6 +133,7 @@ L(zva_64):
> stp q0, q0, [dstend, -32]
> ret
>
> +#ifndef ZVA64_ONLY
> .p2align 3
> L(zva_128):
> cmp tmp1w, 5 /* ZVA size is 128 bytes. */
Ok.
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index a1a4de3cd93c48db6e47eebc9c111186efca53fb..171ca5e4cf9a87fc7df5896f21c2e5b94ea218ba 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -12,10 +12,10 @@ sysdep_routines += \
> memmove_mops \
> memset_a64fx \
> memset_emag \
> - memset_falkor \
> memset_generic \
> memset_kunpeng \
> memset_mops \
> + memset_zva64 \
> strlen_asimd \
> strlen_generic \
> # sysdep_routines
Ok.
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index 3596d3c8d3403b4ea07d80d9a8877e2908a9883e..fdd9ea92463123df213dec27f6f0598f8ce54d6e 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -54,9 +54,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, memmove, mops, __memmove_mops)
> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
> IFUNC_IMPL (i, name, memset,
> - /* Enable this on non-falkor processors too so that other cores
> - can do a comparative analysis with __memset_generic. */
> - IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
> + IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva64)
> IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag)
> IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
Ok.
> #if HAVE_AARCH64_SVE_ASM
> diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
> index 9193b197ddc3a647768184a6a639d6635cfea96e..6deb6865e5154f129922dca673cf069f72f46d79 100644
> --- a/sysdeps/aarch64/multiarch/memset.c
> +++ b/sysdeps/aarch64/multiarch/memset.c
> @@ -28,7 +28,7 @@
>
> extern __typeof (__redirect_memset) __libc_memset;
>
> -extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva64 attribute_hidden;
> extern __typeof (__redirect_memset) __memset_emag attribute_hidden;
> extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
> extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
> @@ -47,18 +47,17 @@ select_memset_ifunc (void)
> {
> if (IS_A64FX (midr) && zva_size == 256)
> return __memset_a64fx;
> - return __memset_generic;
> }
>
> if (IS_KUNPENG920 (midr))
> return __memset_kunpeng;
>
> - if ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64)
> - return __memset_falkor;
> -
> if (IS_EMAG (midr))
> return __memset_emag;
>
> + if (zva_size == 64)
> + return __memset_zva64;
> +
> return __memset_generic;
> }
>
Ok.
> diff --git a/sysdeps/aarch64/multiarch/memset_falkor.S b/sysdeps/aarch64/multiarch/memset_falkor.S
> deleted file mode 100644
> index c6946a8072ce60099f9c3da0cf4ca54785e6a520..0000000000000000000000000000000000000000
> --- a/sysdeps/aarch64/multiarch/memset_falkor.S
> +++ /dev/null
> @@ -1,54 +0,0 @@
> -/* Memset for falkor.
> - Copyright (C) 2017-2023 Free Software Foundation, Inc.
> -
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library. If not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -#include <memset-reg.h>
> -
> -/* Reading dczid_el0 is expensive on falkor so move it into the ifunc
> - resolver and assume ZVA size of 64 bytes. The IFUNC resolver takes care to
> - use this function only when ZVA is enabled. */
> -
> -#if IS_IN (libc)
> -.macro zva_macro
> - .p2align 4
> - /* Write the first and last 64 byte aligned block using stp rather
> - than using DC ZVA. This is faster on some cores. */
> - str q0, [dst, 16]
> - stp q0, q0, [dst, 32]
> - bic dst, dst, 63
> - stp q0, q0, [dst, 64]
> - stp q0, q0, [dst, 96]
> - sub count, dstend, dst /* Count is now 128 too large. */
> - sub count, count, 128+64+64 /* Adjust count and bias for loop. */
> - add dst, dst, 128
> -1: dc zva, dst
> - add dst, dst, 64
> - subs count, count, 64
> - b.hi 1b
> - stp q0, q0, [dst, 0]
> - stp q0, q0, [dst, 32]
> - stp q0, q0, [dstend, -64]
> - stp q0, q0, [dstend, -32]
> - ret
> -.endm
> -
> -# define ZVA_MACRO zva_macro
> -# define MEMSET __memset_falkor
> -# include <sysdeps/aarch64/memset.S>
> -#endif
> diff --git a/sysdeps/aarch64/multiarch/memset_zva64.S b/sysdeps/aarch64/multiarch/memset_zva64.S
> new file mode 100644
> index 0000000000000000000000000000000000000000..13f45fd3d882c756f18a1679d758e2eb688f9c3d
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_zva64.S
> @@ -0,0 +1,27 @@
> +/* Optimized memset for zva size = 64.
> + Copyright (C) 2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +#define ZVA64_ONLY 1
> +#define MEMSET __memset_zva64
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(X)
> +
> +#include "../memset.S"
>
Ok.
@@ -101,19 +101,19 @@ L(tail64):
ret
L(try_zva):
-#ifdef ZVA_MACRO
- zva_macro
-#else
+#ifndef ZVA64_ONLY
.p2align 3
mrs tmp1, dczid_el0
tbnz tmp1w, 4, L(no_zva)
and tmp1w, tmp1w, 15
cmp tmp1w, 4 /* ZVA size is 64 bytes. */
b.ne L(zva_128)
-
+ nop
+#endif
/* Write the first and last 64 byte aligned block using stp rather
than using DC ZVA. This is faster on some cores.
*/
+ .p2align 4
L(zva_64):
str q0, [dst, 16]
stp q0, q0, [dst, 32]
@@ -123,7 +123,6 @@ L(zva_64):
sub count, dstend, dst /* Count is now 128 too large. */
sub count, count, 128+64+64 /* Adjust count and bias for loop. */
add dst, dst, 128
- nop
1: dc zva, dst
add dst, dst, 64
subs count, count, 64
@@ -134,6 +133,7 @@ L(zva_64):
stp q0, q0, [dstend, -32]
ret
+#ifndef ZVA64_ONLY
.p2align 3
L(zva_128):
cmp tmp1w, 5 /* ZVA size is 128 bytes. */
@@ -12,10 +12,10 @@ sysdep_routines += \
memmove_mops \
memset_a64fx \
memset_emag \
- memset_falkor \
memset_generic \
memset_kunpeng \
memset_mops \
+ memset_zva64 \
strlen_asimd \
strlen_generic \
# sysdep_routines
@@ -54,9 +54,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove, mops, __memmove_mops)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
IFUNC_IMPL (i, name, memset,
- /* Enable this on non-falkor processors too so that other cores
- can do a comparative analysis with __memset_generic. */
- IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
+ IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva64)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
#if HAVE_AARCH64_SVE_ASM
@@ -28,7 +28,7 @@
extern __typeof (__redirect_memset) __libc_memset;
-extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
+extern __typeof (__redirect_memset) __memset_zva64 attribute_hidden;
extern __typeof (__redirect_memset) __memset_emag attribute_hidden;
extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
@@ -47,18 +47,17 @@ select_memset_ifunc (void)
{
if (IS_A64FX (midr) && zva_size == 256)
return __memset_a64fx;
- return __memset_generic;
}
if (IS_KUNPENG920 (midr))
return __memset_kunpeng;
- if ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64)
- return __memset_falkor;
-
if (IS_EMAG (midr))
return __memset_emag;
+ if (zva_size == 64)
+ return __memset_zva64;
+
return __memset_generic;
}
deleted file mode 100644
@@ -1,54 +0,0 @@
-/* Memset for falkor.
- Copyright (C) 2017-2023 Free Software Foundation, Inc.
-
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library. If not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <memset-reg.h>
-
-/* Reading dczid_el0 is expensive on falkor so move it into the ifunc
- resolver and assume ZVA size of 64 bytes. The IFUNC resolver takes care to
- use this function only when ZVA is enabled. */
-
-#if IS_IN (libc)
-.macro zva_macro
- .p2align 4
- /* Write the first and last 64 byte aligned block using stp rather
- than using DC ZVA. This is faster on some cores. */
- str q0, [dst, 16]
- stp q0, q0, [dst, 32]
- bic dst, dst, 63
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+64+64 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
- add dst, dst, 64
- subs count, count, 64
- b.hi 1b
- stp q0, q0, [dst, 0]
- stp q0, q0, [dst, 32]
- stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
- ret
-.endm
-
-# define ZVA_MACRO zva_macro
-# define MEMSET __memset_falkor
-# include <sysdeps/aarch64/memset.S>
-#endif
new file mode 100644
@@ -0,0 +1,27 @@
+/* Optimized memset for zva size = 64.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define ZVA64_ONLY 1
+#define MEMSET __memset_zva64
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(X)
+
+#include "../memset.S"