[2/3] AArch64: Add memset_zva64

Message ID PAWPR08MB89822C10A1EEF917E99C85FC83AEA@PAWPR08MB8982.eurprd08.prod.outlook.com
State Committed
Commit 3d7090f14b13312320e425b27dcf0fe72de026fd
Headers
Series [1/3] AArch64: Cleanup emag memset |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-arm fail Patch failed to apply
linaro-tcwg-bot/tcwg_glibc_check--master-arm fail Patch failed to apply
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 fail Patch failed to apply
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 fail Patch failed to apply

Commit Message

Wilco Dijkstra Nov. 10, 2023, 5:35 p.m. UTC
Add a specialized memset for the common ZVA size of 64 bytes.  Since the
code is identical to __memset_falkor, remove the latter.

OK for commit?

---
  

Comments

Adhemerval Zanella Netto Nov. 13, 2023, 4:08 p.m. UTC | #1
On 10/11/23 14:35, Wilco Dijkstra wrote:
> 
> Add a specialized memset for the common ZVA size of 64.  Since the code is
> identical to __memset_falkor, remove the latter.
> 
> OK for commit?

LGTM, thanks.

Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>

> 
> ---
> 
> diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
> index bf3cf85c8a95fd8c03ae13c4173fe507040ee8cd..bbfb7184c3e4277f59178ccf4f9b92814dd7a48d 100644
> --- a/sysdeps/aarch64/memset.S
> +++ b/sysdeps/aarch64/memset.S
> @@ -101,19 +101,19 @@ L(tail64):
>  	ret
>  
>  L(try_zva):
> -#ifdef ZVA_MACRO
> -	zva_macro
> -#else
> +#ifndef ZVA64_ONLY
>  	.p2align 3
>  	mrs	tmp1, dczid_el0
>  	tbnz	tmp1w, 4, L(no_zva)
>  	and	tmp1w, tmp1w, 15
>  	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
>  	b.ne	 L(zva_128)
> -
> +	nop
> +#endif
>  	/* Write the first and last 64 byte aligned block using stp rather
>  	   than using DC ZVA.  This is faster on some cores.
>  	 */
> +	.p2align 4
>  L(zva_64):
>  	str	q0, [dst, 16]
>  	stp	q0, q0, [dst, 32]
> @@ -123,7 +123,6 @@ L(zva_64):
>  	sub	count, dstend, dst	/* Count is now 128 too large.	*/
>  	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
>  	add	dst, dst, 128
> -	nop
>  1:	dc	zva, dst
>  	add	dst, dst, 64
>  	subs	count, count, 64
> @@ -134,6 +133,7 @@ L(zva_64):
>  	stp	q0, q0, [dstend, -32]
>  	ret
>  
> +#ifndef ZVA64_ONLY
>  	.p2align 3
>  L(zva_128):
>  	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */

Ok.

> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index a1a4de3cd93c48db6e47eebc9c111186efca53fb..171ca5e4cf9a87fc7df5896f21c2e5b94ea218ba 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -12,10 +12,10 @@ sysdep_routines += \
>    memmove_mops \
>    memset_a64fx \
>    memset_emag \
> -  memset_falkor \
>    memset_generic \
>    memset_kunpeng \
>    memset_mops \
> +  memset_zva64 \
>    strlen_asimd \
>    strlen_generic \
>  # sysdep_routines

Ok.

> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index 3596d3c8d3403b4ea07d80d9a8877e2908a9883e..fdd9ea92463123df213dec27f6f0598f8ce54d6e 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -54,9 +54,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  	      IFUNC_IMPL_ADD (array, i, memmove, mops, __memmove_mops)
>  	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
>    IFUNC_IMPL (i, name, memset,
> -	      /* Enable this on non-falkor processors too so that other cores
> -		 can do a comparative analysis with __memset_generic.  */
> -	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
> +	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva64)
>  	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag)
>  	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)

Ok.

>  #if HAVE_AARCH64_SVE_ASM
> diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
> index 9193b197ddc3a647768184a6a639d6635cfea96e..6deb6865e5154f129922dca673cf069f72f46d79 100644
> --- a/sysdeps/aarch64/multiarch/memset.c
> +++ b/sysdeps/aarch64/multiarch/memset.c
> @@ -28,7 +28,7 @@
>  
>  extern __typeof (__redirect_memset) __libc_memset;
>  
> -extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva64 attribute_hidden;
>  extern __typeof (__redirect_memset) __memset_emag attribute_hidden;
>  extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
>  extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
> @@ -47,18 +47,17 @@ select_memset_ifunc (void)
>      {
>        if (IS_A64FX (midr) && zva_size == 256)
>  	return __memset_a64fx;
> -      return __memset_generic;
>      }
>  
>    if (IS_KUNPENG920 (midr))
>      return __memset_kunpeng;
>  
> -  if ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64)
> -    return __memset_falkor;
> -
>    if (IS_EMAG (midr))
>      return __memset_emag;
>  
> +  if (zva_size == 64)
> +    return __memset_zva64;
> +
>    return __memset_generic;
>  }
>  

Ok.

> diff --git a/sysdeps/aarch64/multiarch/memset_falkor.S b/sysdeps/aarch64/multiarch/memset_falkor.S
> deleted file mode 100644
> index c6946a8072ce60099f9c3da0cf4ca54785e6a520..0000000000000000000000000000000000000000
> --- a/sysdeps/aarch64/multiarch/memset_falkor.S
> +++ /dev/null
> @@ -1,54 +0,0 @@
> -/* Memset for falkor.
> -   Copyright (C) 2017-2023 Free Software Foundation, Inc.
> -
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library.  If not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#include <sysdep.h>
> -#include <memset-reg.h>
> -
> -/* Reading dczid_el0 is expensive on falkor so move it into the ifunc
> -   resolver and assume ZVA size of 64 bytes.  The IFUNC resolver takes care to
> -   use this function only when ZVA is enabled.  */
> -
> -#if IS_IN (libc)
> -.macro zva_macro
> -	.p2align 4
> -	/* Write the first and last 64 byte aligned block using stp rather
> -	   than using DC ZVA.  This is faster on some cores.  */
> -	str	q0, [dst, 16]
> -	stp	q0, q0, [dst, 32]
> -	bic	dst, dst, 63
> -	stp	q0, q0, [dst, 64]
> -	stp	q0, q0, [dst, 96]
> -	sub	count, dstend, dst	/* Count is now 128 too large.	*/
> -	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
> -	add	dst, dst, 128
> -1:	dc	zva, dst
> -	add	dst, dst, 64
> -	subs	count, count, 64
> -	b.hi	1b
> -	stp	q0, q0, [dst, 0]
> -	stp	q0, q0, [dst, 32]
> -	stp	q0, q0, [dstend, -64]
> -	stp	q0, q0, [dstend, -32]
> -	ret
> -.endm
> -
> -# define ZVA_MACRO zva_macro
> -# define MEMSET __memset_falkor
> -# include <sysdeps/aarch64/memset.S>
> -#endif
> diff --git a/sysdeps/aarch64/multiarch/memset_zva64.S b/sysdeps/aarch64/multiarch/memset_zva64.S
> new file mode 100644
> index 0000000000000000000000000000000000000000..13f45fd3d882c756f18a1679d758e2eb688f9c3d
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_zva64.S
> @@ -0,0 +1,27 @@
> +/* Optimized memset for zva size = 64.
> +   Copyright (C) 2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +#define ZVA64_ONLY 1
> +#define MEMSET __memset_zva64
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(X)
> +
> +#include "../memset.S"
> 

Ok.
  

Patch

diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index bf3cf85c8a95fd8c03ae13c4173fe507040ee8cd..bbfb7184c3e4277f59178ccf4f9b92814dd7a48d 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -101,19 +101,19 @@  L(tail64):
 	ret
 
 L(try_zva):
-#ifdef ZVA_MACRO
-	zva_macro
-#else
+#ifndef ZVA64_ONLY
 	.p2align 3
 	mrs	tmp1, dczid_el0
 	tbnz	tmp1w, 4, L(no_zva)
 	and	tmp1w, tmp1w, 15
 	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
 	b.ne	 L(zva_128)
-
+	nop
+#endif
 	/* Write the first and last 64 byte aligned block using stp rather
 	   than using DC ZVA.  This is faster on some cores.
 	 */
+	.p2align 4
 L(zva_64):
 	str	q0, [dst, 16]
 	stp	q0, q0, [dst, 32]
@@ -123,7 +123,6 @@  L(zva_64):
 	sub	count, dstend, dst	/* Count is now 128 too large.	*/
 	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
 	add	dst, dst, 128
-	nop
 1:	dc	zva, dst
 	add	dst, dst, 64
 	subs	count, count, 64
@@ -134,6 +133,7 @@  L(zva_64):
 	stp	q0, q0, [dstend, -32]
 	ret
 
+#ifndef ZVA64_ONLY
 	.p2align 3
 L(zva_128):
 	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index a1a4de3cd93c48db6e47eebc9c111186efca53fb..171ca5e4cf9a87fc7df5896f21c2e5b94ea218ba 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -12,10 +12,10 @@  sysdep_routines += \
   memmove_mops \
   memset_a64fx \
   memset_emag \
-  memset_falkor \
   memset_generic \
   memset_kunpeng \
   memset_mops \
+  memset_zva64 \
   strlen_asimd \
   strlen_generic \
 # sysdep_routines
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 3596d3c8d3403b4ea07d80d9a8877e2908a9883e..fdd9ea92463123df213dec27f6f0598f8ce54d6e 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -54,9 +54,7 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memmove, mops, __memmove_mops)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
   IFUNC_IMPL (i, name, memset,
-	      /* Enable this on non-falkor processors too so that other cores
-		 can do a comparative analysis with __memset_generic.  */
-	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
+	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva64)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
 #if HAVE_AARCH64_SVE_ASM
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index 9193b197ddc3a647768184a6a639d6635cfea96e..6deb6865e5154f129922dca673cf069f72f46d79 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -28,7 +28,7 @@ 
 
 extern __typeof (__redirect_memset) __libc_memset;
 
-extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
+extern __typeof (__redirect_memset) __memset_zva64 attribute_hidden;
 extern __typeof (__redirect_memset) __memset_emag attribute_hidden;
 extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
 extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
@@ -47,18 +47,17 @@  select_memset_ifunc (void)
     {
       if (IS_A64FX (midr) && zva_size == 256)
 	return __memset_a64fx;
-      return __memset_generic;
     }
 
   if (IS_KUNPENG920 (midr))
     return __memset_kunpeng;
 
-  if ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64)
-    return __memset_falkor;
-
   if (IS_EMAG (midr))
     return __memset_emag;
 
+  if (zva_size == 64)
+    return __memset_zva64;
+
   return __memset_generic;
 }
 
diff --git a/sysdeps/aarch64/multiarch/memset_falkor.S b/sysdeps/aarch64/multiarch/memset_falkor.S
deleted file mode 100644
index c6946a8072ce60099f9c3da0cf4ca54785e6a520..0000000000000000000000000000000000000000
--- a/sysdeps/aarch64/multiarch/memset_falkor.S
+++ /dev/null
@@ -1,54 +0,0 @@ 
-/* Memset for falkor.
-   Copyright (C) 2017-2023 Free Software Foundation, Inc.
-
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library.  If not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include <memset-reg.h>
-
-/* Reading dczid_el0 is expensive on falkor so move it into the ifunc
-   resolver and assume ZVA size of 64 bytes.  The IFUNC resolver takes care to
-   use this function only when ZVA is enabled.  */
-
-#if IS_IN (libc)
-.macro zva_macro
-	.p2align 4
-	/* Write the first and last 64 byte aligned block using stp rather
-	   than using DC ZVA.  This is faster on some cores.  */
-	str	q0, [dst, 16]
-	stp	q0, q0, [dst, 32]
-	bic	dst, dst, 63
-	stp	q0, q0, [dst, 64]
-	stp	q0, q0, [dst, 96]
-	sub	count, dstend, dst	/* Count is now 128 too large.	*/
-	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
-	add	dst, dst, 128
-1:	dc	zva, dst
-	add	dst, dst, 64
-	subs	count, count, 64
-	b.hi	1b
-	stp	q0, q0, [dst, 0]
-	stp	q0, q0, [dst, 32]
-	stp	q0, q0, [dstend, -64]
-	stp	q0, q0, [dstend, -32]
-	ret
-.endm
-
-# define ZVA_MACRO zva_macro
-# define MEMSET __memset_falkor
-# include <sysdeps/aarch64/memset.S>
-#endif
diff --git a/sysdeps/aarch64/multiarch/memset_zva64.S b/sysdeps/aarch64/multiarch/memset_zva64.S
new file mode 100644
index 0000000000000000000000000000000000000000..13f45fd3d882c756f18a1679d758e2eb688f9c3d
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_zva64.S
@@ -0,0 +1,27 @@ 
+/* Optimized memset for zva size = 64.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define ZVA64_ONLY 1
+#define MEMSET __memset_zva64
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(X)
+
+#include "../memset.S"