[1/3] AArch64: Cleanup emag memset
Checks
Commit Message
Cleanup emag memset - merge the memset_base64.S file, remove the ZVA code.
OK for commit?
---
Comments
On 10/11/23 14:33, Wilco Dijkstra wrote:
>
> Cleanup emag memset - merge the memset_base64.S file, remove the ZVA code.
>
> OK for commit?
I think it would be good to add on commit message that a zva 64 optimization
path will be added on generic implementation, along with an optimized zva 64
for eMAG (so the ifunc seletion will still use zva optimized memset).
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
>
> ---
>
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index 836e8317a5d3b652134d199cf685499983b1a3fc..3596d3c8d3403b4ea07d80d9a8877e2908a9883e 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -57,7 +57,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> /* Enable this on non-falkor processors too so that other cores
> can do a comparative analysis with __memset_generic. */
> IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
> - IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_emag)
> + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag)
> IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
> #if HAVE_AARCH64_SVE_ASM
> IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 256, __memset_a64fx)
> diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
> index 23fc66e15879847557b0e4f6941f03bc7ac5cab9..9193b197ddc3a647768184a6a639d6635cfea96e 100644
> --- a/sysdeps/aarch64/multiarch/memset.c
> +++ b/sysdeps/aarch64/multiarch/memset.c
> @@ -56,7 +56,7 @@ select_memset_ifunc (void)
> if ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64)
> return __memset_falkor;
>
> - if (IS_EMAG (midr) && zva_size == 64)
> + if (IS_EMAG (midr))
> return __memset_emag;
>
> return __memset_generic;
> diff --git a/sysdeps/aarch64/multiarch/memset_base64.S b/sysdeps/aarch64/multiarch/memset_base64.S
> deleted file mode 100644
> index 0e8f709fa58478d6e9d62020c576bb9be108866c..0000000000000000000000000000000000000000
> --- a/sysdeps/aarch64/multiarch/memset_base64.S
> +++ /dev/null
> @@ -1,185 +0,0 @@
> -/* Copyright (C) 2018-2023 Free Software Foundation, Inc.
> -
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library. If not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -#include "memset-reg.h"
> -
> -#ifndef MEMSET
> -# define MEMSET __memset_base64
> -#endif
> -
> -/* To disable DC ZVA, set this threshold to 0. */
> -#ifndef DC_ZVA_THRESHOLD
> -# define DC_ZVA_THRESHOLD 512
> -#endif
> -
> -/* Assumptions:
> - *
> - * ARMv8-a, AArch64, unaligned accesses
> - *
> - */
> -
> -ENTRY (MEMSET)
> -
> - PTR_ARG (0)
> - SIZE_ARG (2)
> -
> - bfi valw, valw, 8, 8
> - bfi valw, valw, 16, 16
> - bfi val, val, 32, 32
> -
> - add dstend, dstin, count
> -
> - cmp count, 96
> - b.hi L(set_long)
> - cmp count, 16
> - b.hs L(set_medium)
> -
> - /* Set 0..15 bytes. */
> - tbz count, 3, 1f
> - str val, [dstin]
> - str val, [dstend, -8]
> - ret
> -
> - .p2align 3
> -1: tbz count, 2, 2f
> - str valw, [dstin]
> - str valw, [dstend, -4]
> - ret
> -2: cbz count, 3f
> - strb valw, [dstin]
> - tbz count, 1, 3f
> - strh valw, [dstend, -2]
> -3: ret
> -
> - .p2align 3
> - /* Set 16..96 bytes. */
> -L(set_medium):
> - stp val, val, [dstin]
> - tbnz count, 6, L(set96)
> - stp val, val, [dstend, -16]
> - tbz count, 5, 1f
> - stp val, val, [dstin, 16]
> - stp val, val, [dstend, -32]
> -1: ret
> -
> - .p2align 4
> - /* Set 64..96 bytes. Write 64 bytes from the start and
> - 32 bytes from the end. */
> -L(set96):
> - stp val, val, [dstin, 16]
> - stp val, val, [dstin, 32]
> - stp val, val, [dstin, 48]
> - stp val, val, [dstend, -32]
> - stp val, val, [dstend, -16]
> - ret
> -
> - .p2align 4
> -L(set_long):
> - stp val, val, [dstin]
> - bic dst, dstin, 15
> -#if DC_ZVA_THRESHOLD
> - cmp count, DC_ZVA_THRESHOLD
> - ccmp val, 0, 0, cs
> - b.eq L(zva_64)
> -#endif
> - /* Small-size or non-zero memset does not use DC ZVA. */
> - sub count, dstend, dst
> -
> - /*
> - * Adjust count and bias for loop. By subtracting extra 1 from count,
> - * it is easy to use tbz instruction to check whether loop tailing
> - * count is less than 33 bytes, so as to bypass 2 unnecessary stps.
> - */
> - sub count, count, 64+16+1
> -
> -#if DC_ZVA_THRESHOLD
> - /* Align loop on 16-byte boundary, this might be friendly to i-cache. */
> - nop
> -#endif
> -
> -1: stp val, val, [dst, 16]
> - stp val, val, [dst, 32]
> - stp val, val, [dst, 48]
> - stp val, val, [dst, 64]!
> - subs count, count, 64
> - b.hs 1b
> -
> - tbz count, 5, 1f /* Remaining count is less than 33 bytes? */
> - stp val, val, [dst, 16]
> - stp val, val, [dst, 32]
> -1: stp val, val, [dstend, -32]
> - stp val, val, [dstend, -16]
> - ret
> -
> -#if DC_ZVA_THRESHOLD
> - .p2align 3
> -L(zva_64):
> - stp val, val, [dst, 16]
> - stp val, val, [dst, 32]
> - stp val, val, [dst, 48]
> - bic dst, dst, 63
> -
> - /*
> - * Previous memory writes might cross cache line boundary, and cause
> - * cache line partially dirty. Zeroing this kind of cache line using
> - * DC ZVA will incur extra cost, for it requires loading untouched
> - * part of the line from memory before zeoring.
> - *
> - * So, write the first 64 byte aligned block using stp to force
> - * fully dirty cache line.
> - */
> - stp val, val, [dst, 64]
> - stp val, val, [dst, 80]
> - stp val, val, [dst, 96]
> - stp val, val, [dst, 112]
> -
> - sub count, dstend, dst
> - /*
> - * Adjust count and bias for loop. By subtracting extra 1 from count,
> - * it is easy to use tbz instruction to check whether loop tailing
> - * count is less than 33 bytes, so as to bypass 2 unnecessary stps.
> - */
> - sub count, count, 128+64+64+1
> - add dst, dst, 128
> - nop
> -
> - /* DC ZVA sets 64 bytes each time. */
> -1: dc zva, dst
> - add dst, dst, 64
> - subs count, count, 64
> - b.hs 1b
> -
> - /*
> - * Write the last 64 byte aligned block using stp to force fully
> - * dirty cache line.
> - */
> - stp val, val, [dst, 0]
> - stp val, val, [dst, 16]
> - stp val, val, [dst, 32]
> - stp val, val, [dst, 48]
> -
> - tbz count, 5, 1f /* Remaining count is less than 33 bytes? */
> - stp val, val, [dst, 64]
> - stp val, val, [dst, 80]
> -1: stp val, val, [dstend, -32]
> - stp val, val, [dstend, -16]
> - ret
> -#endif
> -
> -END (MEMSET)
> diff --git a/sysdeps/aarch64/multiarch/memset_emag.S b/sysdeps/aarch64/multiarch/memset_emag.S
> index 6fecad4fae699f9967da94ddc88867afd5c59414..bbfa815925899149e2313a9317380fa9fd089abd 100644
> --- a/sysdeps/aarch64/multiarch/memset_emag.S
> +++ b/sysdeps/aarch64/multiarch/memset_emag.S
> @@ -18,17 +18,95 @@
> <https://www.gnu.org/licenses/>. */
>
> #include <sysdep.h>
> +#include "memset-reg.h"
>
> -#define MEMSET __memset_emag
> -
> -/*
> - * Using DC ZVA to zero memory does not produce better performance if
> - * memory size is not very large, especially when there are multiple
> - * processes/threads contending memory/cache. Here we set threshold to
> - * zero to disable using DC ZVA, which is good for multi-process/thread
> - * workloads.
> +/* Assumptions:
> + *
> + * ARMv8-a, AArch64, unaligned accesses
> + *
> */
>
> -#define DC_ZVA_THRESHOLD 0
> +ENTRY (__memset_emag)
> +
> + PTR_ARG (0)
> + SIZE_ARG (2)
> +
> + bfi valw, valw, 8, 8
> + bfi valw, valw, 16, 16
> + bfi val, val, 32, 32
> +
> + add dstend, dstin, count
> +
> + cmp count, 96
> + b.hi L(set_long)
> + cmp count, 16
> + b.hs L(set_medium)
> +
> + /* Set 0..15 bytes. */
> + tbz count, 3, 1f
> + str val, [dstin]
> + str val, [dstend, -8]
> + ret
> +
> + .p2align 3
> +1: tbz count, 2, 2f
> + str valw, [dstin]
> + str valw, [dstend, -4]
> + ret
> +2: cbz count, 3f
> + strb valw, [dstin]
> + tbz count, 1, 3f
> + strh valw, [dstend, -2]
> +3: ret
> +
> + .p2align 3
> + /* Set 16..96 bytes. */
> +L(set_medium):
> + stp val, val, [dstin]
> + tbnz count, 6, L(set96)
> + stp val, val, [dstend, -16]
> + tbz count, 5, 1f
> + stp val, val, [dstin, 16]
> + stp val, val, [dstend, -32]
> +1: ret
> +
> + .p2align 4
> + /* Set 64..96 bytes. Write 64 bytes from the start and
> + 32 bytes from the end. */
> +L(set96):
> + stp val, val, [dstin, 16]
> + stp val, val, [dstin, 32]
> + stp val, val, [dstin, 48]
> + stp val, val, [dstend, -32]
> + stp val, val, [dstend, -16]
> + ret
> +
> + .p2align 4
> +L(set_long):
> + stp val, val, [dstin]
> + bic dst, dstin, 15
> + /* Small-size or non-zero memset does not use DC ZVA. */
> + sub count, dstend, dst
> +
> + /*
> + * Adjust count and bias for loop. By subtracting extra 1 from count,
> + * it is easy to use tbz instruction to check whether loop tailing
> + * count is less than 33 bytes, so as to bypass 2 unnecessary stps.
> + */
> + sub count, count, 64+16+1
> +
> +1: stp val, val, [dst, 16]
> + stp val, val, [dst, 32]
> + stp val, val, [dst, 48]
> + stp val, val, [dst, 64]!
> + subs count, count, 64
> + b.hs 1b
> +
> + tbz count, 5, 1f /* Remaining count is less than 33 bytes? */
> + stp val, val, [dst, 16]
> + stp val, val, [dst, 32]
> +1: stp val, val, [dstend, -32]
> + stp val, val, [dstend, -16]
> + ret
>
> -#include "./memset_base64.S"
> +END (__memset_emag)
>
@@ -57,7 +57,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Enable this on non-falkor processors too so that other cores
can do a comparative analysis with __memset_generic. */
IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
- IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_emag)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
#if HAVE_AARCH64_SVE_ASM
IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 256, __memset_a64fx)
@@ -56,7 +56,7 @@ select_memset_ifunc (void)
if ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64)
return __memset_falkor;
- if (IS_EMAG (midr) && zva_size == 64)
+ if (IS_EMAG (midr))
return __memset_emag;
return __memset_generic;
deleted file mode 100644
@@ -1,185 +0,0 @@
-/* Copyright (C) 2018-2023 Free Software Foundation, Inc.
-
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library. If not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include "memset-reg.h"
-
-#ifndef MEMSET
-# define MEMSET __memset_base64
-#endif
-
-/* To disable DC ZVA, set this threshold to 0. */
-#ifndef DC_ZVA_THRESHOLD
-# define DC_ZVA_THRESHOLD 512
-#endif
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, unaligned accesses
- *
- */
-
-ENTRY (MEMSET)
-
- PTR_ARG (0)
- SIZE_ARG (2)
-
- bfi valw, valw, 8, 8
- bfi valw, valw, 16, 16
- bfi val, val, 32, 32
-
- add dstend, dstin, count
-
- cmp count, 96
- b.hi L(set_long)
- cmp count, 16
- b.hs L(set_medium)
-
- /* Set 0..15 bytes. */
- tbz count, 3, 1f
- str val, [dstin]
- str val, [dstend, -8]
- ret
-
- .p2align 3
-1: tbz count, 2, 2f
- str valw, [dstin]
- str valw, [dstend, -4]
- ret
-2: cbz count, 3f
- strb valw, [dstin]
- tbz count, 1, 3f
- strh valw, [dstend, -2]
-3: ret
-
- .p2align 3
- /* Set 16..96 bytes. */
-L(set_medium):
- stp val, val, [dstin]
- tbnz count, 6, L(set96)
- stp val, val, [dstend, -16]
- tbz count, 5, 1f
- stp val, val, [dstin, 16]
- stp val, val, [dstend, -32]
-1: ret
-
- .p2align 4
- /* Set 64..96 bytes. Write 64 bytes from the start and
- 32 bytes from the end. */
-L(set96):
- stp val, val, [dstin, 16]
- stp val, val, [dstin, 32]
- stp val, val, [dstin, 48]
- stp val, val, [dstend, -32]
- stp val, val, [dstend, -16]
- ret
-
- .p2align 4
-L(set_long):
- stp val, val, [dstin]
- bic dst, dstin, 15
-#if DC_ZVA_THRESHOLD
- cmp count, DC_ZVA_THRESHOLD
- ccmp val, 0, 0, cs
- b.eq L(zva_64)
-#endif
- /* Small-size or non-zero memset does not use DC ZVA. */
- sub count, dstend, dst
-
- /*
- * Adjust count and bias for loop. By subtracting extra 1 from count,
- * it is easy to use tbz instruction to check whether loop tailing
- * count is less than 33 bytes, so as to bypass 2 unnecessary stps.
- */
- sub count, count, 64+16+1
-
-#if DC_ZVA_THRESHOLD
- /* Align loop on 16-byte boundary, this might be friendly to i-cache. */
- nop
-#endif
-
-1: stp val, val, [dst, 16]
- stp val, val, [dst, 32]
- stp val, val, [dst, 48]
- stp val, val, [dst, 64]!
- subs count, count, 64
- b.hs 1b
-
- tbz count, 5, 1f /* Remaining count is less than 33 bytes? */
- stp val, val, [dst, 16]
- stp val, val, [dst, 32]
-1: stp val, val, [dstend, -32]
- stp val, val, [dstend, -16]
- ret
-
-#if DC_ZVA_THRESHOLD
- .p2align 3
-L(zva_64):
- stp val, val, [dst, 16]
- stp val, val, [dst, 32]
- stp val, val, [dst, 48]
- bic dst, dst, 63
-
- /*
- * Previous memory writes might cross cache line boundary, and cause
- * cache line partially dirty. Zeroing this kind of cache line using
- * DC ZVA will incur extra cost, for it requires loading untouched
- * part of the line from memory before zeoring.
- *
- * So, write the first 64 byte aligned block using stp to force
- * fully dirty cache line.
- */
- stp val, val, [dst, 64]
- stp val, val, [dst, 80]
- stp val, val, [dst, 96]
- stp val, val, [dst, 112]
-
- sub count, dstend, dst
- /*
- * Adjust count and bias for loop. By subtracting extra 1 from count,
- * it is easy to use tbz instruction to check whether loop tailing
- * count is less than 33 bytes, so as to bypass 2 unnecessary stps.
- */
- sub count, count, 128+64+64+1
- add dst, dst, 128
- nop
-
- /* DC ZVA sets 64 bytes each time. */
-1: dc zva, dst
- add dst, dst, 64
- subs count, count, 64
- b.hs 1b
-
- /*
- * Write the last 64 byte aligned block using stp to force fully
- * dirty cache line.
- */
- stp val, val, [dst, 0]
- stp val, val, [dst, 16]
- stp val, val, [dst, 32]
- stp val, val, [dst, 48]
-
- tbz count, 5, 1f /* Remaining count is less than 33 bytes? */
- stp val, val, [dst, 64]
- stp val, val, [dst, 80]
-1: stp val, val, [dstend, -32]
- stp val, val, [dstend, -16]
- ret
-#endif
-
-END (MEMSET)
@@ -18,17 +18,95 @@
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
+#include "memset-reg.h"
-#define MEMSET __memset_emag
-
-/*
- * Using DC ZVA to zero memory does not produce better performance if
- * memory size is not very large, especially when there are multiple
- * processes/threads contending memory/cache. Here we set threshold to
- * zero to disable using DC ZVA, which is good for multi-process/thread
- * workloads.
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses
+ *
*/
-#define DC_ZVA_THRESHOLD 0
+ENTRY (__memset_emag)
+
+ PTR_ARG (0)
+ SIZE_ARG (2)
+
+ bfi valw, valw, 8, 8
+ bfi valw, valw, 16, 16
+ bfi val, val, 32, 32
+
+ add dstend, dstin, count
+
+ cmp count, 96
+ b.hi L(set_long)
+ cmp count, 16
+ b.hs L(set_medium)
+
+ /* Set 0..15 bytes. */
+ tbz count, 3, 1f
+ str val, [dstin]
+ str val, [dstend, -8]
+ ret
+
+ .p2align 3
+1: tbz count, 2, 2f
+ str valw, [dstin]
+ str valw, [dstend, -4]
+ ret
+2: cbz count, 3f
+ strb valw, [dstin]
+ tbz count, 1, 3f
+ strh valw, [dstend, -2]
+3: ret
+
+ .p2align 3
+ /* Set 16..96 bytes. */
+L(set_medium):
+ stp val, val, [dstin]
+ tbnz count, 6, L(set96)
+ stp val, val, [dstend, -16]
+ tbz count, 5, 1f
+ stp val, val, [dstin, 16]
+ stp val, val, [dstend, -32]
+1: ret
+
+ .p2align 4
+ /* Set 64..96 bytes. Write 64 bytes from the start and
+ 32 bytes from the end. */
+L(set96):
+ stp val, val, [dstin, 16]
+ stp val, val, [dstin, 32]
+ stp val, val, [dstin, 48]
+ stp val, val, [dstend, -32]
+ stp val, val, [dstend, -16]
+ ret
+
+ .p2align 4
+L(set_long):
+ stp val, val, [dstin]
+ bic dst, dstin, 15
+ /* Small-size or non-zero memset does not use DC ZVA. */
+ sub count, dstend, dst
+
+ /*
+ * Adjust count and bias for loop. By subtracting extra 1 from count,
+ * it is easy to use tbz instruction to check whether loop tailing
+ * count is less than 33 bytes, so as to bypass 2 unnecessary stps.
+ */
+ sub count, count, 64+16+1
+
+1: stp val, val, [dst, 16]
+ stp val, val, [dst, 32]
+ stp val, val, [dst, 48]
+ stp val, val, [dst, 64]!
+ subs count, count, 64
+ b.hs 1b
+
+ tbz count, 5, 1f /* Remaining count is less than 33 bytes? */
+ stp val, val, [dst, 16]
+ stp val, val, [dst, 32]
+1: stp val, val, [dstend, -32]
+ stp val, val, [dstend, -16]
+ ret
-#include "./memset_base64.S"
+END (__memset_emag)