[1/3] AArch64: Cleanup emag memset

Message ID PAWPR08MB8982D5D207425242ABE581BE83AEA@PAWPR08MB8982.eurprd08.prod.outlook.com
State Committed
Commit 9627ab99b50d250c6dd3001a3355aa03692f7fe5
Series [1/3] AArch64: Cleanup emag memset

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-arm fail Patch failed to apply
linaro-tcwg-bot/tcwg_glibc_check--master-arm fail Patch failed to apply
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 fail Patch failed to apply
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 fail Patch failed to apply

Commit Message

Wilco Dijkstra Nov. 10, 2023, 5:33 p.m. UTC
  Cleanup emag memset - merge the memset_base64.S file, remove the ZVA code.

OK for commit?

---
  

Comments

Adhemerval Zanella Netto Nov. 13, 2023, 4:05 p.m. UTC | #1
On 10/11/23 14:33, Wilco Dijkstra wrote:
> 
> Cleanup emag memset - merge the memset_base64.S file, remove the ZVA code.
> 
> OK for commit?

I think it would be good to note in the commit message that a zva 64 optimization
path will be added to the generic implementation, along with an optimized zva 64
variant for eMAG (so the ifunc selection will still use a zva-optimized memset).
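For reference, once those follow-ups land, the eMAG branch of select_memset_ifunc
could end up looking roughly like the sketch below (hypothetical only;
__memset_emag_zva64 is an assumed name for the zva 64 variant, not something this
patch introduces):

  /* Hypothetical: pick a DC ZVA variant for eMAG when the reported ZVA
     block size is 64 bytes, otherwise use the plain stp-loop version
     from this patch.  */
  if (IS_EMAG (midr))
    return zva_size == 64 ? __memset_emag_zva64 : __memset_emag;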

Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>

  

Patch

diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 836e8317a5d3b652134d199cf685499983b1a3fc..3596d3c8d3403b4ea07d80d9a8877e2908a9883e 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -57,7 +57,7 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      /* Enable this on non-falkor processors too so that other cores
 		 can do a comparative analysis with __memset_generic.  */
 	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
-	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_emag)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
 #if HAVE_AARCH64_SVE_ASM
 	      IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 256, __memset_a64fx)
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index 23fc66e15879847557b0e4f6941f03bc7ac5cab9..9193b197ddc3a647768184a6a639d6635cfea96e 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -56,7 +56,7 @@  select_memset_ifunc (void)
   if ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64)
     return __memset_falkor;
 
-  if (IS_EMAG (midr) && zva_size == 64)
+  if (IS_EMAG (midr))
     return __memset_emag;
 
   return __memset_generic;
diff --git a/sysdeps/aarch64/multiarch/memset_base64.S b/sysdeps/aarch64/multiarch/memset_base64.S
deleted file mode 100644
index 0e8f709fa58478d6e9d62020c576bb9be108866c..0000000000000000000000000000000000000000
--- a/sysdeps/aarch64/multiarch/memset_base64.S
+++ /dev/null
@@ -1,185 +0,0 @@ 
-/* Copyright (C) 2018-2023 Free Software Foundation, Inc.
-
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library.  If not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include "memset-reg.h"
-
-#ifndef MEMSET
-# define MEMSET __memset_base64
-#endif
-
-/* To disable DC ZVA, set this threshold to 0. */
-#ifndef DC_ZVA_THRESHOLD
-# define DC_ZVA_THRESHOLD 512
-#endif
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, unaligned accesses
- *
- */
-
-ENTRY (MEMSET)
-
-	PTR_ARG (0)
-	SIZE_ARG (2)
-
-	bfi	valw, valw, 8, 8
-	bfi	valw, valw, 16, 16
-	bfi	val, val, 32, 32
-
-	add	dstend, dstin, count
-
-	cmp	count, 96
-	b.hi	L(set_long)
-	cmp	count, 16
-	b.hs	L(set_medium)
-
-	/* Set 0..15 bytes.  */
-	tbz	count, 3, 1f
-	str	val, [dstin]
-	str	val, [dstend, -8]
-	ret
-
-	.p2align 3
-1:	tbz	count, 2, 2f
-	str	valw, [dstin]
-	str	valw, [dstend, -4]
-	ret
-2:	cbz	count, 3f
-	strb	valw, [dstin]
-	tbz	count, 1, 3f
-	strh	valw, [dstend, -2]
-3:	ret
-
-	.p2align 3
-	/* Set 16..96 bytes.  */
-L(set_medium):
-	stp	val, val, [dstin]
-	tbnz	count, 6, L(set96)
-	stp	val, val, [dstend, -16]
-	tbz	count, 5, 1f
-	stp	val, val, [dstin, 16]
-	stp	val, val, [dstend, -32]
-1:	ret
-
-	.p2align 4
-	/* Set 64..96 bytes.  Write 64 bytes from the start and
-	   32 bytes from the end.  */
-L(set96):
-	stp	val, val, [dstin, 16]
-	stp	val, val, [dstin, 32]
-	stp	val, val, [dstin, 48]
-	stp	val, val, [dstend, -32]
-	stp	val, val, [dstend, -16]
-	ret
-
-	.p2align 4
-L(set_long):
-	stp	val, val, [dstin]
-	bic	dst, dstin, 15
-#if DC_ZVA_THRESHOLD
-	cmp	count, DC_ZVA_THRESHOLD
-	ccmp	val, 0, 0, cs
-	b.eq	L(zva_64)
-#endif
-	/* Small-size or non-zero memset does not use DC ZVA. */
-	sub	count, dstend, dst
-
-	/*
-	 * Adjust count and bias for loop. By subtracting extra 1 from count,
-	 * it is easy to use tbz instruction to check whether loop tailing
-	 * count is less than 33 bytes, so as to bypass 2 unnecessary stps.
-	 */
-	sub	count, count, 64+16+1
-
-#if DC_ZVA_THRESHOLD
-	/* Align loop on 16-byte boundary, this might be friendly to i-cache. */
-	nop
-#endif
-
-1:	stp	val, val, [dst, 16]
-	stp	val, val, [dst, 32]
-	stp	val, val, [dst, 48]
-	stp	val, val, [dst, 64]!
-	subs	count, count, 64
-	b.hs	1b
-
-	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes? */
-	stp	val, val, [dst, 16]
-	stp	val, val, [dst, 32]
-1:	stp	val, val, [dstend, -32]
-	stp	val, val, [dstend, -16]
-	ret
-
-#if DC_ZVA_THRESHOLD
-	.p2align 3
-L(zva_64):
-	stp	val, val, [dst, 16]
-	stp	val, val, [dst, 32]
-	stp	val, val, [dst, 48]
-	bic	dst, dst, 63
-
-	/*
-	 * Previous memory writes might cross cache line boundary, and cause
-	 * cache line partially dirty. Zeroing this kind of cache line using
-	 * DC ZVA will incur extra cost, for it requires loading untouched
-	 * part of the line from memory before zeoring.
-	 *
-	 * So, write the first 64 byte aligned block using stp to force
-	 * fully dirty cache line.
-	 */
-	stp	val, val, [dst, 64]
-	stp	val, val, [dst, 80]
-	stp	val, val, [dst, 96]
-	stp	val, val, [dst, 112]
-
-	sub	count, dstend, dst
-	/*
-	 * Adjust count and bias for loop. By subtracting extra 1 from count,
-	 * it is easy to use tbz instruction to check whether loop tailing
-	 * count is less than 33 bytes, so as to bypass 2 unnecessary stps.
-	 */
-	sub	count, count, 128+64+64+1
-	add	dst, dst, 128
-	nop
-
-	/* DC ZVA sets 64 bytes each time. */
-1:	dc	zva, dst
-	add	dst, dst, 64
-	subs	count, count, 64
-	b.hs	1b
-
-	/*
-	 * Write the last 64 byte aligned block using stp to force fully
-	 * dirty cache line.
-	 */
-	stp	val, val, [dst, 0]
-	stp	val, val, [dst, 16]
-	stp	val, val, [dst, 32]
-	stp	val, val, [dst, 48]
-
-	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes? */
-	stp	val, val, [dst, 64]
-	stp	val, val, [dst, 80]
-1:	stp	val, val, [dstend, -32]
-	stp	val, val, [dstend, -16]
-	ret
-#endif
-
-END (MEMSET)
diff --git a/sysdeps/aarch64/multiarch/memset_emag.S b/sysdeps/aarch64/multiarch/memset_emag.S
index 6fecad4fae699f9967da94ddc88867afd5c59414..bbfa815925899149e2313a9317380fa9fd089abd 100644
--- a/sysdeps/aarch64/multiarch/memset_emag.S
+++ b/sysdeps/aarch64/multiarch/memset_emag.S
@@ -18,17 +18,95 @@ 
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#include "memset-reg.h"
 
-#define MEMSET __memset_emag
-
-/*
- * Using DC ZVA to zero memory does not produce better performance if
- * memory size is not very large, especially when there are multiple
- * processes/threads contending memory/cache. Here we set threshold to
- * zero to disable using DC ZVA, which is good for multi-process/thread
- * workloads.
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses
+ *
  */
 
-#define DC_ZVA_THRESHOLD 0
+ENTRY (__memset_emag)
+
+	PTR_ARG (0)
+	SIZE_ARG (2)
+
+	bfi	valw, valw, 8, 8
+	bfi	valw, valw, 16, 16
+	bfi	val, val, 32, 32
+
+	add	dstend, dstin, count
+
+	cmp	count, 96
+	b.hi	L(set_long)
+	cmp	count, 16
+	b.hs	L(set_medium)
+
+	/* Set 0..15 bytes.  */
+	tbz	count, 3, 1f
+	str	val, [dstin]
+	str	val, [dstend, -8]
+	ret
+
+	.p2align 3
+1:	tbz	count, 2, 2f
+	str	valw, [dstin]
+	str	valw, [dstend, -4]
+	ret
+2:	cbz	count, 3f
+	strb	valw, [dstin]
+	tbz	count, 1, 3f
+	strh	valw, [dstend, -2]
+3:	ret
+
+	.p2align 3
+	/* Set 16..96 bytes.  */
+L(set_medium):
+	stp	val, val, [dstin]
+	tbnz	count, 6, L(set96)
+	stp	val, val, [dstend, -16]
+	tbz	count, 5, 1f
+	stp	val, val, [dstin, 16]
+	stp	val, val, [dstend, -32]
+1:	ret
+
+	.p2align 4
+	/* Set 64..96 bytes.  Write 64 bytes from the start and
+	   32 bytes from the end.  */
+L(set96):
+	stp	val, val, [dstin, 16]
+	stp	val, val, [dstin, 32]
+	stp	val, val, [dstin, 48]
+	stp	val, val, [dstend, -32]
+	stp	val, val, [dstend, -16]
+	ret
+
+	.p2align 4
+L(set_long):
+	stp	val, val, [dstin]
+	bic	dst, dstin, 15
+	/* Small-size or non-zero memset does not use DC ZVA. */
+	sub	count, dstend, dst
+
+	/*
+	 * Adjust count and bias for loop. By subtracting extra 1 from count,
+	 * it is easy to use tbz instruction to check whether loop tailing
+	 * count is less than 33 bytes, so as to bypass 2 unnecessary stps.
+	 */
+	sub	count, count, 64+16+1
+
+1:	stp	val, val, [dst, 16]
+	stp	val, val, [dst, 32]
+	stp	val, val, [dst, 48]
+	stp	val, val, [dst, 64]!
+	subs	count, count, 64
+	b.hs	1b
+
+	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes? */
+	stp	val, val, [dst, 16]
+	stp	val, val, [dst, 32]
+1:	stp	val, val, [dstend, -32]
+	stp	val, val, [dstend, -16]
+	ret
 
-#include "./memset_base64.S"
+END (__memset_emag)
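
The three bfi instructions at the top of __memset_emag broadcast the low byte of
the fill value across all eight bytes of a 64-bit register. A minimal standalone
C sketch of the equivalent computation (illustrative only, not part of the patch):

  #include <stdint.h>
  #include <stdio.h>

  /* Replicate the low byte of c across all eight bytes of a 64-bit word,
     mirroring the effect of the bfi sequence (only the low byte of the
     memset value argument matters).  */
  static inline uint64_t
  broadcast_byte (unsigned int c)
  {
    uint64_t val = c & 0xff;
    val |= val << 8;   /* bfi valw, valw, 8, 8   */
    val |= val << 16;  /* bfi valw, valw, 16, 16 */
    val |= val << 32;  /* bfi val, val, 32, 32   */
    return val;
  }

  int
  main (void)
  {
    /* Prints abababababababab.  */
    printf ("%016llx\n", (unsigned long long) broadcast_byte (0xab));
    return 0;
  }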