@@ -453,7 +453,8 @@ This tunable is specific to powerpc, powerpc64 and powerpc64le.
The @code{glibc.cpu.name=xxx} tunable allows the user to tell @theglibc{} to
assume that the CPU is @code{xxx} where xxx may have one of these values:
@code{generic}, @code{falkor}, @code{thunderxt88}, @code{thunderx2t99},
-@code{thunderx2t99p1}, @code{ares}, @code{emag}, @code{kunpeng}.
+@code{thunderx2t99p1}, @code{ares}, @code{emag}, @code{kunpeng},
+@code{a64fx}.
This tunable is specific to aarch64.
@end deftp
@@ -1,6 +1,6 @@
ifeq ($(subdir),string)
sysdep_routines += memcpy_generic memcpy_advsimd memcpy_thunderx memcpy_thunderx2 \
- memcpy_falkor \
+ memcpy_falkor memcpy_a64fx \
memset_generic memset_falkor memset_emag memset_kunpeng \
memchr_generic memchr_nosimd \
strlen_mte strlen_asimd
@@ -25,7 +25,11 @@
#include <stdio.h>
/* Maximum number of IFUNC implementations. */
-#define MAX_IFUNC 4
+#if HAVE_SVE_ASM_SUPPORT
+# define MAX_IFUNC 7
+#else
+# define MAX_IFUNC 6
+#endif
size_t
__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
@@ -43,12 +47,18 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy, !bti, __memcpy_thunderx2)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd)
+#if HAVE_SVE_ASM_SUPPORT
+ IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_a64fx)
+#endif
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
IFUNC_IMPL (i, name, memmove,
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
IFUNC_IMPL_ADD (array, i, memmove, !bti, __memmove_thunderx2)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd)
+#if HAVE_SVE_ASM_SUPPORT
+ IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_a64fx)
+#endif
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
IFUNC_IMPL (i, name, memset,
/* Enable this on non-falkor processors too so that other cores
@@ -33,4 +33,6 @@
bool __attribute__((unused)) bti = \
HAVE_AARCH64_BTI && GLRO(dl_aarch64_cpu_features).bti; \
bool __attribute__((unused)) mte = \
- MTE_ENABLED ();
+ MTE_ENABLED (); \
+ unsigned __attribute__((unused)) sve = \
+ GLRO(dl_aarch64_cpu_features).sve;
@@ -33,6 +33,9 @@ extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
+#if HAVE_SVE_ASM_SUPPORT
+extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
+#endif
libc_ifunc (__libc_memcpy,
(IS_THUNDERX (midr)
@@ -44,8 +47,13 @@ libc_ifunc (__libc_memcpy,
: (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
|| IS_NEOVERSE_V1 (midr)
? __memcpy_simd
- : __memcpy_generic)))));
-
+#if HAVE_SVE_ASM_SUPPORT
+ : (IS_A64FX (midr)
+ ? __memcpy_a64fx
+ : __memcpy_generic))))));
+#else
+ : __memcpy_generic)))));
+#endif
# undef memcpy
strong_alias (__libc_memcpy, memcpy);
#endif
new file mode 100644
@@ -0,0 +1,979 @@
+/* Optimized memcpy for Fujitsu A64FX processor.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#if HAVE_SVE_ASM_SUPPORT
+#if IS_IN (libc)
+# define MEMCPY __memcpy_a64fx
+# define MEMMOVE __memmove_a64fx
+
+/* Assumptions:
+ *
+ * ARMv8.2-a, AArch64, unaligned accesses, sve
+ *
+ */
+
+#define L1_SIZE (64*1024)/2	// half of L1 (64KB): threshold for L(L1_prefetch)
+#define L2_SIZE (7*1024*1024)/2	// half of usable L2 (8MB minus 1MB)
+#define CACHE_LINE_SIZE 256	// A64FX cache line is 256 bytes
+#define PF_DIST_L1 (CACHE_LINE_SIZE * 16)	// L1 prefetch distance (4KB)
+#define PF_DIST_L2 (CACHE_LINE_SIZE * 64)	// L2 prefetch distance (16KB)
+#define dest x0
+#define src x1
+#define n x2 // size in bytes
+#define tmp1 x3	// scratch
+#define tmp2 x4	// scratch
+#define rest x5	// bytes remaining to copy
+#define dest_ptr x6	// running destination cursor
+#define src_ptr x7	// running source cursor
+#define vector_length x8	// SVE vector length in bytes (from cntb)
+#define vl_remainder x9 // vector_length remainder
+#define cl_remainder x10 // CACHE_LINE_SIZE remainder
+
+ .arch armv8.2-a+sve
+
+ENTRY_ALIGN (MEMCPY, 6)
+
+	PTR_ARG (0)
+	SIZE_ARG (2)
+
+L(fwd_start):
+	cmp	n, 0			// nothing to copy?
+	ccmp	dest, src, 4, ne	// Z=1 when n == 0, else dest == src?
+	b.ne	L(init)			// fall through only if n != 0 && dest != src
+	ret
+
+L(init):
+	mov	rest, n
+	mov	dest_ptr, dest
+	mov	src_ptr, src
+	cntb	vector_length		// SVE register size in bytes
+	ptrue	p0.b			// all-true byte predicate
+L(L2):
+	// read DC ZVA block size; BS field 6 means 2^6 words = 256 bytes
+	mrs	tmp1, dczid_el0
+	cmp	tmp1, 6 // must match CACHE_LINE_SIZE 256, else skip ZVA path
+	b.ne	L(vl_agnostic)
+
+	// if rest >= L2_SIZE
+	cmp	rest, L2_SIZE
+	b.cc	L(L1_prefetch)
+	// align dest address at vector_length byte boundary
+	sub	tmp1, vector_length, 1
+	and	tmp2, dest_ptr, tmp1
+	// if vl_remainder == 0
+	cmp	tmp2, 0
+	b.eq	1f
+	sub	vl_remainder, vector_length, tmp2
+	// process remainder until the first vector_length boundary
+	whilelt	p0.b, xzr, vl_remainder	// predicate covers only the remainder
+	ld1b	z0.b, p0/z, [src_ptr]
+	st1b	z0.b, p0, [dest_ptr]
+	add	dest_ptr, dest_ptr, vl_remainder
+	add	src_ptr, src_ptr, vl_remainder
+	sub	rest, rest, vl_remainder
+	// align dest address at CACHE_LINE_SIZE byte boundary
+1:	mov	tmp1, CACHE_LINE_SIZE
+	and	tmp2, dest_ptr, CACHE_LINE_SIZE - 1
+	// if cl_remainder == 0
+	cmp	tmp2, 0
+	b.eq	L(L2_dc_zva)
+	sub	cl_remainder, tmp1, tmp2
+	// process remainder until the first CACHE_LINE_SIZE boundary
+	mov	tmp1, xzr // index
+2:	whilelt	p0.b, tmp1, cl_remainder	// one vector (or less) per pass
+	ld1b	z0.b, p0/z, [src_ptr, tmp1]
+	st1b	z0.b, p0, [dest_ptr, tmp1]
+	incb	tmp1
+	cmp	tmp1, cl_remainder
+	b.lo	2b
+	add	dest_ptr, dest_ptr, cl_remainder
+	add	src_ptr, src_ptr, cl_remainder
+	sub	rest, rest, cl_remainder
+
+L(L2_dc_zva): // pre-zero 20 lines of dest with DC ZVA (unrolled)
+	and	tmp1, dest, 0xffffffffffffff	// drop top byte (MTE tag) of dest
+	and	tmp2, src, 0xffffffffffffff	// drop top byte (MTE tag) of src
+	sub	tmp1, tmp2, tmp1 // diff = src - dest
+	mov	tmp2, CACHE_LINE_SIZE * 20
+	cmp	tmp1, tmp2
+	b.lo	L(L1_prefetch)	// too close: ZVA would clobber unread src
+	mov	tmp1, dest_ptr
+	dc	zva, tmp1 // 1
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	dc	zva, tmp1 // 2
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	dc	zva, tmp1 // 3
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	dc	zva, tmp1 // 4
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	dc	zva, tmp1 // 5
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	dc	zva, tmp1 // 6
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	dc	zva, tmp1 // 7
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	dc	zva, tmp1 // 8
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	dc	zva, tmp1 // 9
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	dc	zva, tmp1 // 10
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	dc	zva, tmp1 // 11
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	dc	zva, tmp1 // 12
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	dc	zva, tmp1 // 13
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	dc	zva, tmp1 // 14
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	dc	zva, tmp1 // 15
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	dc	zva, tmp1 // 16
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	dc	zva, tmp1 // 17
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	dc	zva, tmp1 // 18
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	dc	zva, tmp1 // 19
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	dc	zva, tmp1 // 20
+
+L(L2_vl_64): // VL64 unroll8
+ cmp vector_length, 64
+ b.ne L(L2_vl_32)
+ ptrue p0.b
+ .p2align 3
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ add src_ptr, src_ptr, CACHE_LINE_SIZE * 2
+ sub rest, rest, CACHE_LINE_SIZE * 2
+1: st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ mov tmp1, PF_DIST_L1
+ prfm pstl1keep, [dest_ptr, tmp1]
+ mov tmp1, PF_DIST_L2
+ prfm pstl2keep, [dest_ptr, tmp1]
+ mov tmp2, CACHE_LINE_SIZE * 19
+ add tmp2, dest_ptr, tmp2
+ dc zva, tmp2 // distance CACHE_LINE_SIZE * 19
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ mov tmp1, PF_DIST_L1 + CACHE_LINE_SIZE
+ prfm pstl1keep, [dest_ptr, tmp1]
+ mov tmp1, PF_DIST_L2 + CACHE_LINE_SIZE
+ prfm pstl2keep, [dest_ptr, tmp1]
+ add tmp2, tmp2, CACHE_LINE_SIZE
+ dc zva, tmp2 // distance CACHE_LINE_SIZE * 20
+ add dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
+ add src_ptr, src_ptr, CACHE_LINE_SIZE * 2
+ sub rest, rest, CACHE_LINE_SIZE * 2
+ cmp rest, L2_SIZE
+ b.ge 1b
+ st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ add dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
+
+L(L2_vl_32): // VL32 unroll6
+ cmp vector_length, 32
+ b.ne L(L2_vl_16)
+ ptrue p0.b
+ .p2align 3
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ add src_ptr, src_ptr, CACHE_LINE_SIZE
+ sub rest, rest, CACHE_LINE_SIZE
+1: st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ mov tmp1, PF_DIST_L1
+ prfm pstl1keep, [dest_ptr, tmp1]
+ mov tmp1, PF_DIST_L2
+ prfm pstl2keep, [dest_ptr, tmp1]
+ mov tmp2, CACHE_LINE_SIZE * 19
+ add tmp2, dest_ptr, tmp2
+ dc zva, tmp2 // distance CACHE_LINE_SIZE * 19
+ add dest_ptr, dest_ptr, CACHE_LINE_SIZE
+ add src_ptr, src_ptr, CACHE_LINE_SIZE
+ st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ mov tmp1, PF_DIST_L1 + CACHE_LINE_SIZE
+ prfm pstl1keep, [dest_ptr, tmp1]
+ mov tmp1, PF_DIST_L2 + CACHE_LINE_SIZE
+ prfm pstl2keep, [dest_ptr, tmp1]
+ add tmp2, tmp2, CACHE_LINE_SIZE
+ dc zva, tmp2 // distance CACHE_LINE_SIZE * 20
+ add dest_ptr, dest_ptr, CACHE_LINE_SIZE
+ add src_ptr, src_ptr, CACHE_LINE_SIZE
+ sub rest, rest, CACHE_LINE_SIZE * 2
+ cmp rest, L2_SIZE
+ b.ge 1b
+ st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ add dest_ptr, dest_ptr, CACHE_LINE_SIZE
+
+L(L2_vl_16): // VL16 unroll32
+ cmp vector_length, 16
+ b.ne L(L1_prefetch)
+ ptrue p0.b
+ .p2align 3
+ add src_ptr, src_ptr, CACHE_LINE_SIZE / 2
+ ld1b z16.b, p0/z, [src_ptr, #-8, mul vl]
+ ld1b z17.b, p0/z, [src_ptr, #-7, mul vl]
+ ld1b z18.b, p0/z, [src_ptr, #-6, mul vl]
+ ld1b z19.b, p0/z, [src_ptr, #-5, mul vl]
+ ld1b z20.b, p0/z, [src_ptr, #-4, mul vl]
+ ld1b z21.b, p0/z, [src_ptr, #-3, mul vl]
+ ld1b z22.b, p0/z, [src_ptr, #-2, mul vl]
+ ld1b z23.b, p0/z, [src_ptr, #-1, mul vl]
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ add src_ptr, src_ptr, CACHE_LINE_SIZE / 2
+ sub rest, rest, CACHE_LINE_SIZE
+1: add dest_ptr, dest_ptr, CACHE_LINE_SIZE / 2
+ add src_ptr, src_ptr, CACHE_LINE_SIZE / 2
+ st1b z16.b, p0, [dest_ptr, #-8, mul vl]
+ st1b z17.b, p0, [dest_ptr, #-7, mul vl]
+ ld1b z16.b, p0/z, [src_ptr, #-8, mul vl]
+ ld1b z17.b, p0/z, [src_ptr, #-7, mul vl]
+ st1b z18.b, p0, [dest_ptr, #-6, mul vl]
+ st1b z19.b, p0, [dest_ptr, #-5, mul vl]
+ ld1b z18.b, p0/z, [src_ptr, #-6, mul vl]
+ ld1b z19.b, p0/z, [src_ptr, #-5, mul vl]
+ st1b z20.b, p0, [dest_ptr, #-4, mul vl]
+ st1b z21.b, p0, [dest_ptr, #-3, mul vl]
+ ld1b z20.b, p0/z, [src_ptr, #-4, mul vl]
+ ld1b z21.b, p0/z, [src_ptr, #-3, mul vl]
+ st1b z22.b, p0, [dest_ptr, #-2, mul vl]
+ st1b z23.b, p0, [dest_ptr, #-1, mul vl]
+ ld1b z22.b, p0/z, [src_ptr, #-2, mul vl]
+ ld1b z23.b, p0/z, [src_ptr, #-1, mul vl]
+ st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ mov tmp1, PF_DIST_L1
+ prfm pstl1keep, [dest_ptr, tmp1]
+ mov tmp1, PF_DIST_L2
+ prfm pstl2keep, [dest_ptr, tmp1]
+ mov tmp2, CACHE_LINE_SIZE * 19
+ add tmp2, dest_ptr, tmp2
+ dc zva, tmp2 // distance CACHE_LINE_SIZE * 19
+ add dest_ptr, dest_ptr, CACHE_LINE_SIZE
+ add src_ptr, src_ptr, CACHE_LINE_SIZE
+ st1b z16.b, p0, [dest_ptr, #-8, mul vl]
+ st1b z17.b, p0, [dest_ptr, #-7, mul vl]
+ ld1b z16.b, p0/z, [src_ptr, #-8, mul vl]
+ ld1b z17.b, p0/z, [src_ptr, #-7, mul vl]
+ st1b z18.b, p0, [dest_ptr, #-6, mul vl]
+ st1b z19.b, p0, [dest_ptr, #-5, mul vl]
+ ld1b z18.b, p0/z, [src_ptr, #-6, mul vl]
+ ld1b z19.b, p0/z, [src_ptr, #-5, mul vl]
+ st1b z20.b, p0, [dest_ptr, #-4, mul vl]
+ st1b z21.b, p0, [dest_ptr, #-3, mul vl]
+ ld1b z20.b, p0/z, [src_ptr, #-4, mul vl]
+ ld1b z21.b, p0/z, [src_ptr, #-3, mul vl]
+ st1b z22.b, p0, [dest_ptr, #-2, mul vl]
+ st1b z23.b, p0, [dest_ptr, #-1, mul vl]
+ ld1b z22.b, p0/z, [src_ptr, #-2, mul vl]
+ ld1b z23.b, p0/z, [src_ptr, #-1, mul vl]
+ st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ mov tmp1, PF_DIST_L1 + CACHE_LINE_SIZE
+ prfm pstl1keep, [dest_ptr, tmp1]
+ mov tmp1, PF_DIST_L2 + CACHE_LINE_SIZE
+ prfm pstl2keep, [dest_ptr, tmp1]
+ add tmp2, tmp2, CACHE_LINE_SIZE
+ dc zva, tmp2 // distance CACHE_LINE_SIZE * 20
+ add dest_ptr, dest_ptr, CACHE_LINE_SIZE / 2
+ add src_ptr, src_ptr, CACHE_LINE_SIZE / 2
+ sub rest, rest, CACHE_LINE_SIZE * 2
+ cmp rest, L2_SIZE
+ b.ge 1b
+ add dest_ptr, dest_ptr, CACHE_LINE_SIZE / 2
+ st1b z16.b, p0, [dest_ptr, #-8, mul vl]
+ st1b z17.b, p0, [dest_ptr, #-7, mul vl]
+ st1b z18.b, p0, [dest_ptr, #-6, mul vl]
+ st1b z19.b, p0, [dest_ptr, #-5, mul vl]
+ st1b z20.b, p0, [dest_ptr, #-4, mul vl]
+ st1b z21.b, p0, [dest_ptr, #-3, mul vl]
+ st1b z22.b, p0, [dest_ptr, #-2, mul vl]
+ st1b z23.b, p0, [dest_ptr, #-1, mul vl]
+ st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ add dest_ptr, dest_ptr, CACHE_LINE_SIZE / 2
+
+L(L1_prefetch): // if rest >= L1_SIZE
+ cmp rest, L1_SIZE
+ b.cc L(vl_agnostic)
+L(L1_vl_64):
+ cmp vector_length, 64
+ b.ne L(L1_vl_32)
+ ptrue p0.b
+ .p2align 3
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ add src_ptr, src_ptr, CACHE_LINE_SIZE * 2
+ sub rest, rest, CACHE_LINE_SIZE * 2
+1: st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ mov tmp1, PF_DIST_L1
+ prfm pstl1keep, [dest_ptr, tmp1]
+ mov tmp1, PF_DIST_L2
+ prfm pstl2keep, [dest_ptr, tmp1]
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ mov tmp1, PF_DIST_L1 + CACHE_LINE_SIZE
+ prfm pstl1keep, [dest_ptr, tmp1]
+ mov tmp1, PF_DIST_L2 + CACHE_LINE_SIZE
+ prfm pstl2keep, [dest_ptr, tmp1]
+ add dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
+ add src_ptr, src_ptr, CACHE_LINE_SIZE * 2
+ sub rest, rest, CACHE_LINE_SIZE * 2
+ cmp rest, L1_SIZE
+ b.ge 1b
+ st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ add dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
+
+L(L1_vl_32):
+ cmp vector_length, 32
+ b.ne L(L1_vl_16)
+ ptrue p0.b
+ .p2align 3
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ add src_ptr, src_ptr, CACHE_LINE_SIZE
+ sub rest, rest, CACHE_LINE_SIZE
+1: st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ mov tmp1, PF_DIST_L1
+ prfm pstl1keep, [dest_ptr, tmp1]
+ mov tmp1, PF_DIST_L2
+ prfm pstl2keep, [dest_ptr, tmp1]
+ add dest_ptr, dest_ptr, CACHE_LINE_SIZE
+ add src_ptr, src_ptr, CACHE_LINE_SIZE
+ st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ mov tmp1, PF_DIST_L1 + CACHE_LINE_SIZE
+ prfm pstl1keep, [dest_ptr, tmp1]
+ mov tmp1, PF_DIST_L2 + CACHE_LINE_SIZE
+ prfm pstl2keep, [dest_ptr, tmp1]
+ add dest_ptr, dest_ptr, CACHE_LINE_SIZE
+ add src_ptr, src_ptr, CACHE_LINE_SIZE
+ sub rest, rest, CACHE_LINE_SIZE * 2
+ cmp rest, L1_SIZE
+ b.ge 1b
+ st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ add dest_ptr, dest_ptr, CACHE_LINE_SIZE
+
+L(L1_vl_16):
+ cmp vector_length, 16
+ b.ne L(vl_agnostic)
+ ptrue p0.b
+ .p2align 3
+ add src_ptr, src_ptr, CACHE_LINE_SIZE / 2
+ ld1b z16.b, p0/z, [src_ptr, #-8, mul vl]
+ ld1b z17.b, p0/z, [src_ptr, #-7, mul vl]
+ ld1b z18.b, p0/z, [src_ptr, #-6, mul vl]
+ ld1b z19.b, p0/z, [src_ptr, #-5, mul vl]
+ ld1b z20.b, p0/z, [src_ptr, #-4, mul vl]
+ ld1b z21.b, p0/z, [src_ptr, #-3, mul vl]
+ ld1b z22.b, p0/z, [src_ptr, #-2, mul vl]
+ ld1b z23.b, p0/z, [src_ptr, #-1, mul vl]
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ add src_ptr, src_ptr, CACHE_LINE_SIZE / 2
+ sub rest, rest, CACHE_LINE_SIZE
+1: add dest_ptr, dest_ptr, CACHE_LINE_SIZE / 2
+ add src_ptr, src_ptr, CACHE_LINE_SIZE / 2
+ st1b z16.b, p0, [dest_ptr, #-8, mul vl]
+ st1b z17.b, p0, [dest_ptr, #-7, mul vl]
+ ld1b z16.b, p0/z, [src_ptr, #-8, mul vl]
+ ld1b z17.b, p0/z, [src_ptr, #-7, mul vl]
+ st1b z18.b, p0, [dest_ptr, #-6, mul vl]
+ st1b z19.b, p0, [dest_ptr, #-5, mul vl]
+ ld1b z18.b, p0/z, [src_ptr, #-6, mul vl]
+ ld1b z19.b, p0/z, [src_ptr, #-5, mul vl]
+ st1b z20.b, p0, [dest_ptr, #-4, mul vl]
+ st1b z21.b, p0, [dest_ptr, #-3, mul vl]
+ ld1b z20.b, p0/z, [src_ptr, #-4, mul vl]
+ ld1b z21.b, p0/z, [src_ptr, #-3, mul vl]
+ st1b z22.b, p0, [dest_ptr, #-2, mul vl]
+ st1b z23.b, p0, [dest_ptr, #-1, mul vl]
+ ld1b z22.b, p0/z, [src_ptr, #-2, mul vl]
+ ld1b z23.b, p0/z, [src_ptr, #-1, mul vl]
+ st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ mov tmp1, PF_DIST_L1
+ prfm pstl1keep, [dest_ptr, tmp1]
+ mov tmp1, PF_DIST_L2
+ prfm pstl2keep, [dest_ptr, tmp1]
+ add dest_ptr, dest_ptr, CACHE_LINE_SIZE
+ add src_ptr, src_ptr, CACHE_LINE_SIZE
+ st1b z16.b, p0, [dest_ptr, #-8, mul vl]
+ st1b z17.b, p0, [dest_ptr, #-7, mul vl]
+ ld1b z16.b, p0/z, [src_ptr, #-8, mul vl]
+ ld1b z17.b, p0/z, [src_ptr, #-7, mul vl]
+ st1b z18.b, p0, [dest_ptr, #-6, mul vl]
+ st1b z19.b, p0, [dest_ptr, #-5, mul vl]
+ ld1b z18.b, p0/z, [src_ptr, #-6, mul vl]
+ ld1b z19.b, p0/z, [src_ptr, #-5, mul vl]
+ st1b z20.b, p0, [dest_ptr, #-4, mul vl]
+ st1b z21.b, p0, [dest_ptr, #-3, mul vl]
+ ld1b z20.b, p0/z, [src_ptr, #-4, mul vl]
+ ld1b z21.b, p0/z, [src_ptr, #-3, mul vl]
+ st1b z22.b, p0, [dest_ptr, #-2, mul vl]
+ st1b z23.b, p0, [dest_ptr, #-1, mul vl]
+ ld1b z22.b, p0/z, [src_ptr, #-2, mul vl]
+ ld1b z23.b, p0/z, [src_ptr, #-1, mul vl]
+ st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ mov tmp1, PF_DIST_L1 + CACHE_LINE_SIZE
+ prfm pstl1keep, [dest_ptr, tmp1]
+ mov tmp1, PF_DIST_L2 + CACHE_LINE_SIZE
+ prfm pstl2keep, [dest_ptr, tmp1]
+ add dest_ptr, dest_ptr, CACHE_LINE_SIZE / 2
+ add src_ptr, src_ptr, CACHE_LINE_SIZE / 2
+ sub rest, rest, CACHE_LINE_SIZE * 2
+ cmp rest, L1_SIZE
+ b.ge 1b
+ add dest_ptr, dest_ptr, CACHE_LINE_SIZE / 2
+ st1b z16.b, p0, [dest_ptr, #-8, mul vl]
+ st1b z17.b, p0, [dest_ptr, #-7, mul vl]
+ st1b z18.b, p0, [dest_ptr, #-6, mul vl]
+ st1b z19.b, p0, [dest_ptr, #-5, mul vl]
+ st1b z20.b, p0, [dest_ptr, #-4, mul vl]
+ st1b z21.b, p0, [dest_ptr, #-3, mul vl]
+ st1b z22.b, p0, [dest_ptr, #-2, mul vl]
+ st1b z23.b, p0, [dest_ptr, #-1, mul vl]
+ st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ add dest_ptr, dest_ptr, CACHE_LINE_SIZE / 2
+
+L(vl_agnostic): // VL Agnostic
+
+L(unroll32): // unrolling and software pipeline
+ lsl tmp1, vector_length, 3 // vector_length * 8
+ lsl tmp2, vector_length, 5 // vector_length * 32
+ ptrue p0.b
+ .p2align 3
+1: cmp rest, tmp2
+ b.cc L(unroll8)
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ add dest_ptr, dest_ptr, tmp1
+ add src_ptr, src_ptr, tmp1
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ add dest_ptr, dest_ptr, tmp1
+ add src_ptr, src_ptr, tmp1
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ add dest_ptr, dest_ptr, tmp1
+ add src_ptr, src_ptr, tmp1
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ add dest_ptr, dest_ptr, tmp1
+ add src_ptr, src_ptr, tmp1
+ sub rest, rest, tmp2
+ b 1b
+
+L(unroll8): // unrolling and software pipeline
+ lsl tmp1, vector_length, 3 // vector_length * 8
+ ptrue p0.b
+ .p2align 3
+1: cmp rest, tmp1
+ b.cc L(unroll1)
+ ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
+ st1b z0.b, p0, [dest_ptr, #0, mul vl]
+ st1b z1.b, p0, [dest_ptr, #1, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
+ st1b z2.b, p0, [dest_ptr, #2, mul vl]
+ st1b z3.b, p0, [dest_ptr, #3, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
+ st1b z4.b, p0, [dest_ptr, #4, mul vl]
+ st1b z5.b, p0, [dest_ptr, #5, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
+ st1b z6.b, p0, [dest_ptr, #6, mul vl]
+ st1b z7.b, p0, [dest_ptr, #7, mul vl]
+ add dest_ptr, dest_ptr, tmp1
+ add src_ptr, src_ptr, tmp1
+ sub rest, rest, tmp1
+ b 1b
+
+L(unroll1):	// copy one whole vector per iteration
+	ptrue	p0.b
+	.p2align 3
+1:	cmp	rest, vector_length
+	b.cc	L(last)
+	ld1b	z0.b, p0/z, [src_ptr]
+	st1b	z0.b, p0, [dest_ptr]
+	add	dest_ptr, dest_ptr, vector_length
+	add	src_ptr, src_ptr, vector_length
+	sub	rest, rest, vector_length
+	b	1b
+
+L(last):	// tail: rest < vector_length, predicated partial copy
+	whilelt	p0.b, xzr, rest
+	ld1b	z0.b, p0/z, [src_ptr]
+	st1b	z0.b, p0, [dest_ptr]
+	ret
+
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
+
+
+ .p2align 4
+ENTRY_ALIGN (MEMMOVE, 6)
+
+ // untag dest/src (clear the top byte) before computing their distance
+ and tmp1, dest, 0xffffffffffffff
+ and tmp2, src, 0xffffffffffffff
+ sub tmp1, tmp1, tmp2 // diff = dest - src
+ // if diff <= 0 || diff >= n then memcpy (forward copy cannot clobber src)
+ cmp tmp1, 0
+ ccmp tmp1, n, 2, gt // diff > 0 ? compare diff,n : force C set (nzcv = 2)
+ b.cs L(fwd_start) // taken when diff <= 0 or (unsigned) diff >= n
+
+L(bwd_start):
+ mov rest, n // bytes remaining
+ add dest_ptr, dest, n // dest_end
+ add src_ptr, src, n // src_end
+ cntb vector_length // SVE vector length in bytes
+ ptrue p0.b
+ udiv tmp1, n, vector_length // quotient
+ mul tmp1, tmp1, vector_length // product
+ sub vl_remainder, n, tmp1 // vl_remainder = n % vector_length
+ // if vl_remainder == 0 then skip vl_remainder bwd copy
+ cmp vl_remainder, 0
+ b.eq L(bwd_main)
+ // vl_remainder bwd copy (predicated partial vector at the very end)
+ whilelt p0.b, xzr, vl_remainder
+ sub src_ptr, src_ptr, vl_remainder
+ sub dest_ptr, dest_ptr, vl_remainder
+ ld1b z0.b, p0/z, [src_ptr]
+ st1b z0.b, p0, [dest_ptr]
+ sub rest, rest, vl_remainder
+
+L(bwd_main):
+
+ // VL-agnostic backward copy loops
+L(bwd_unroll32): // 32 vectors per iteration, unrolled and software-pipelined
+ lsl tmp1, vector_length, 3 // vector_length * 8
+ lsl tmp2, vector_length, 5 // vector_length * 32
+ ptrue p0.b
+ .p2align 3
+1: cmp rest, tmp2
+ b.cc L(bwd_unroll8)
+ sub src_ptr, src_ptr, tmp1
+ sub dest_ptr, dest_ptr, tmp1
+ ld1b z0.b, p0/z, [src_ptr, #7, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #6, mul vl]
+ st1b z0.b, p0, [dest_ptr, #7, mul vl]
+ st1b z1.b, p0, [dest_ptr, #6, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #5, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #4, mul vl]
+ st1b z2.b, p0, [dest_ptr, #5, mul vl]
+ st1b z3.b, p0, [dest_ptr, #4, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #3, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #2, mul vl]
+ st1b z4.b, p0, [dest_ptr, #3, mul vl]
+ st1b z5.b, p0, [dest_ptr, #2, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #1, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #0, mul vl]
+ st1b z6.b, p0, [dest_ptr, #1, mul vl]
+ st1b z7.b, p0, [dest_ptr, #0, mul vl]
+ sub src_ptr, src_ptr, tmp1
+ sub dest_ptr, dest_ptr, tmp1
+ ld1b z0.b, p0/z, [src_ptr, #7, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #6, mul vl]
+ st1b z0.b, p0, [dest_ptr, #7, mul vl]
+ st1b z1.b, p0, [dest_ptr, #6, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #5, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #4, mul vl]
+ st1b z2.b, p0, [dest_ptr, #5, mul vl]
+ st1b z3.b, p0, [dest_ptr, #4, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #3, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #2, mul vl]
+ st1b z4.b, p0, [dest_ptr, #3, mul vl]
+ st1b z5.b, p0, [dest_ptr, #2, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #1, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #0, mul vl]
+ st1b z6.b, p0, [dest_ptr, #1, mul vl]
+ st1b z7.b, p0, [dest_ptr, #0, mul vl]
+ sub src_ptr, src_ptr, tmp1
+ sub dest_ptr, dest_ptr, tmp1
+ ld1b z0.b, p0/z, [src_ptr, #7, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #6, mul vl]
+ st1b z0.b, p0, [dest_ptr, #7, mul vl]
+ st1b z1.b, p0, [dest_ptr, #6, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #5, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #4, mul vl]
+ st1b z2.b, p0, [dest_ptr, #5, mul vl]
+ st1b z3.b, p0, [dest_ptr, #4, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #3, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #2, mul vl]
+ st1b z4.b, p0, [dest_ptr, #3, mul vl]
+ st1b z5.b, p0, [dest_ptr, #2, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #1, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #0, mul vl]
+ st1b z6.b, p0, [dest_ptr, #1, mul vl]
+ st1b z7.b, p0, [dest_ptr, #0, mul vl]
+ sub src_ptr, src_ptr, tmp1
+ sub dest_ptr, dest_ptr, tmp1
+ ld1b z0.b, p0/z, [src_ptr, #7, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #6, mul vl]
+ st1b z0.b, p0, [dest_ptr, #7, mul vl]
+ st1b z1.b, p0, [dest_ptr, #6, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #5, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #4, mul vl]
+ st1b z2.b, p0, [dest_ptr, #5, mul vl]
+ st1b z3.b, p0, [dest_ptr, #4, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #3, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #2, mul vl]
+ st1b z4.b, p0, [dest_ptr, #3, mul vl]
+ st1b z5.b, p0, [dest_ptr, #2, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #1, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #0, mul vl]
+ st1b z6.b, p0, [dest_ptr, #1, mul vl]
+ st1b z7.b, p0, [dest_ptr, #0, mul vl]
+ sub rest, rest, tmp2
+ b 1b
+
+L(bwd_unroll8): // 8 vectors per iteration, unrolled and software-pipelined
+ lsl tmp1, vector_length, 3 // vector_length * 8
+ ptrue p0.b
+ .p2align 3
+1: cmp rest, tmp1
+ b.cc L(bwd_unroll1)
+ sub src_ptr, src_ptr, tmp1
+ sub dest_ptr, dest_ptr, tmp1
+ ld1b z0.b, p0/z, [src_ptr, #7, mul vl]
+ ld1b z1.b, p0/z, [src_ptr, #6, mul vl]
+ st1b z0.b, p0, [dest_ptr, #7, mul vl]
+ st1b z1.b, p0, [dest_ptr, #6, mul vl]
+ ld1b z2.b, p0/z, [src_ptr, #5, mul vl]
+ ld1b z3.b, p0/z, [src_ptr, #4, mul vl]
+ st1b z2.b, p0, [dest_ptr, #5, mul vl]
+ st1b z3.b, p0, [dest_ptr, #4, mul vl]
+ ld1b z4.b, p0/z, [src_ptr, #3, mul vl]
+ ld1b z5.b, p0/z, [src_ptr, #2, mul vl]
+ st1b z4.b, p0, [dest_ptr, #3, mul vl]
+ st1b z5.b, p0, [dest_ptr, #2, mul vl]
+ ld1b z6.b, p0/z, [src_ptr, #1, mul vl]
+ ld1b z7.b, p0/z, [src_ptr, #0, mul vl]
+ st1b z6.b, p0, [dest_ptr, #1, mul vl]
+ st1b z7.b, p0, [dest_ptr, #0, mul vl]
+ sub rest, rest, tmp1
+ b 1b
+
+ .p2align 3
+L(bwd_unroll1): // one full vector per iteration
+ ptrue p0.b
+1: cmp rest, vector_length
+ b.cc L(bwd_last)
+ sub src_ptr, src_ptr, vector_length
+ sub dest_ptr, dest_ptr, vector_length
+ ld1b z0.b, p0/z, [src_ptr]
+ st1b z0.b, p0, [dest_ptr]
+ sub rest, rest, vector_length
+ b 1b
+
+L(bwd_last):
+ whilelt p0.b, xzr, rest // predicate covering the final rest bytes
+ sub src_ptr, src_ptr, rest
+ sub dest_ptr, dest_ptr, rest
+ ld1b z0.b, p0/z, [src_ptr]
+ st1b z0.b, p0, [dest_ptr]
+ ret
+
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
+#endif /* IS_IN (libc) */
+#endif /* HAVE_SVE_ASM_SUPPORT */
+
@@ -33,6 +33,9 @@ extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
+#if HAVE_SVE_ASM_SUPPORT
+extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden;
+#endif
libc_ifunc (__libc_memmove,
(IS_THUNDERX (midr)
@@ -44,8 +47,13 @@ libc_ifunc (__libc_memmove,
: (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
|| IS_NEOVERSE_V1 (midr)
? __memmove_simd
- : __memmove_generic)))));
-
+#if HAVE_SVE_ASM_SUPPORT
+ : (IS_A64FX (midr)
+ ? __memmove_a64fx
+ : __memmove_generic))))));
+#else
+ : __memmove_generic)))));
+#endif
# undef memmove
strong_alias (__libc_memmove, memmove);
#endif
@@ -46,6 +46,7 @@ static struct cpu_list cpu_list[] = {
{"ares", 0x411FD0C0},
{"emag", 0x503F0001},
{"kunpeng920", 0x481FD010},
+ {"a64fx", 0x460F0010},
{"generic", 0x0}
};
@@ -116,4 +117,7 @@ init_cpu_features (struct cpu_features *cpu_features)
(PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_ASYNC | MTE_ALLOWED_TAGS),
0, 0, 0);
#endif
+
+ /* Check if SVE is supported. */
+ cpu_features->sve = GLRO (dl_hwcap) & HWCAP_SVE;
}
@@ -65,6 +65,9 @@
#define IS_KUNPENG920(midr) (MIDR_IMPLEMENTOR(midr) == 'H' \
&& MIDR_PARTNUM(midr) == 0xd01)
+#define IS_A64FX(midr) (MIDR_IMPLEMENTOR(midr) == 'F' \
+ && MIDR_PARTNUM(midr) == 0x001)
+
struct cpu_features
{
uint64_t midr_el1;
@@ -72,6 +75,7 @@ struct cpu_features
bool bti;
/* Currently, the GLIBC memory tagging tunable only defines 8 bits. */
uint8_t mte_state;
+ bool sve;
};
#endif /* _CPU_FEATURES_AARCH64_H */