aarch64: Use memcpy_simd as the default memcpy
Commit Message
Since __memcpy_simd is the fastest memcpy on almost all cores, use it by default
if SVE is not available.
Passes regress, OK for commit?
---
Comments
On Wed, Oct 12, 2022 at 8:20 AM Wilco Dijkstra via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Since __memcpy_simd is the fastest memcpy on almost all cores, use it by default
> if SVE is not available.
>
> Passes regress, OK for commit?
>
> ---
> diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
> index 98d4e2c0e202eca13e1fd19ad8046cf61ad280ff..7b396b202fabf01b6ff2adc71a1038148e0b1054 100644
> --- a/sysdeps/aarch64/memcpy.S
> +++ b/sysdeps/aarch64/memcpy.S
> @@ -1,4 +1,5 @@
> -/* Copyright (C) 2012-2022 Free Software Foundation, Inc.
> +/* Generic optimized memcpy using SIMD.
> + Copyright (C) 2012-2022 Free Software Foundation, Inc.
>
> This file is part of the GNU C Library.
>
> @@ -20,7 +21,7 @@
>
> /* Assumptions:
> *
> - * ARMv8-a, AArch64, unaligned accesses.
> + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
> *
> */
>
> @@ -36,21 +37,18 @@
> #define B_l x8
> #define B_lw w8
> #define B_h x9
> -#define C_l x10
> #define C_lw w10
> -#define C_h x11
> -#define D_l x12
> -#define D_h x13
> -#define E_l x14
> -#define E_h x15
> -#define F_l x16
> -#define F_h x17
> -#define G_l count
> -#define G_h dst
> -#define H_l src
> -#define H_h srcend
> #define tmp1 x14
>
> +#define A_q q0
> +#define B_q q1
> +#define C_q q2
> +#define D_q q3
> +#define E_q q4
> +#define F_q q5
> +#define G_q q6
> +#define H_q q7
> +
> #ifndef MEMMOVE
> # define MEMMOVE memmove
> #endif
> @@ -69,10 +67,9 @@
> Large copies use a software pipelined loop processing 64 bytes per
> iteration. The destination pointer is 16-byte aligned to minimize
> unaligned accesses. The loop tail is handled by always copying 64 bytes
> - from the end.
> -*/
> + from the end. */
>
> -ENTRY_ALIGN (MEMCPY, 6)
> +ENTRY (MEMCPY)
> PTR_ARG (0)
> PTR_ARG (1)
> SIZE_ARG (2)
> @@ -87,10 +84,10 @@ ENTRY_ALIGN (MEMCPY, 6)
> /* Small copies: 0..32 bytes. */
> cmp count, 16
> b.lo L(copy16)
> - ldp A_l, A_h, [src]
> - ldp D_l, D_h, [srcend, -16]
> - stp A_l, A_h, [dstin]
> - stp D_l, D_h, [dstend, -16]
> + ldr A_q, [src]
> + ldr B_q, [srcend, -16]
> + str A_q, [dstin]
> + str B_q, [dstend, -16]
> ret
>
> /* Copy 8-15 bytes. */
> @@ -102,7 +99,6 @@ L(copy16):
> str A_h, [dstend, -8]
> ret
>
> - .p2align 3
> /* Copy 4-7 bytes. */
> L(copy8):
> tbz count, 2, L(copy4)
> @@ -128,87 +124,69 @@ L(copy0):
> .p2align 4
> /* Medium copies: 33..128 bytes. */
> L(copy32_128):
> - ldp A_l, A_h, [src]
> - ldp B_l, B_h, [src, 16]
> - ldp C_l, C_h, [srcend, -32]
> - ldp D_l, D_h, [srcend, -16]
> + ldp A_q, B_q, [src]
> + ldp C_q, D_q, [srcend, -32]
> cmp count, 64
> b.hi L(copy128)
> - stp A_l, A_h, [dstin]
> - stp B_l, B_h, [dstin, 16]
> - stp C_l, C_h, [dstend, -32]
> - stp D_l, D_h, [dstend, -16]
> + stp A_q, B_q, [dstin]
> + stp C_q, D_q, [dstend, -32]
> ret
>
> .p2align 4
> /* Copy 65..128 bytes. */
> L(copy128):
> - ldp E_l, E_h, [src, 32]
> - ldp F_l, F_h, [src, 48]
> + ldp E_q, F_q, [src, 32]
> cmp count, 96
> b.ls L(copy96)
> - ldp G_l, G_h, [srcend, -64]
> - ldp H_l, H_h, [srcend, -48]
> - stp G_l, G_h, [dstend, -64]
> - stp H_l, H_h, [dstend, -48]
> + ldp G_q, H_q, [srcend, -64]
> + stp G_q, H_q, [dstend, -64]
> L(copy96):
> - stp A_l, A_h, [dstin]
> - stp B_l, B_h, [dstin, 16]
> - stp E_l, E_h, [dstin, 32]
> - stp F_l, F_h, [dstin, 48]
> - stp C_l, C_h, [dstend, -32]
> - stp D_l, D_h, [dstend, -16]
> + stp A_q, B_q, [dstin]
> + stp E_q, F_q, [dstin, 32]
> + stp C_q, D_q, [dstend, -32]
> ret
>
> - .p2align 4
> + /* Align loop64 below to 16 bytes. */
> + nop
> +
> /* Copy more than 128 bytes. */
> L(copy_long):
> - /* Copy 16 bytes and then align dst to 16-byte alignment. */
> - ldp D_l, D_h, [src]
> - and tmp1, dstin, 15
> - bic dst, dstin, 15
> - sub src, src, tmp1
> + /* Copy 16 bytes and then align src to 16-byte alignment. */
> + ldr D_q, [src]
> + and tmp1, src, 15
> + bic src, src, 15
> + sub dst, dstin, tmp1
> add count, count, tmp1 /* Count is now 16 too large. */
> - ldp A_l, A_h, [src, 16]
> - stp D_l, D_h, [dstin]
> - ldp B_l, B_h, [src, 32]
> - ldp C_l, C_h, [src, 48]
> - ldp D_l, D_h, [src, 64]!
> + ldp A_q, B_q, [src, 16]
> + str D_q, [dstin]
> + ldp C_q, D_q, [src, 48]
> subs count, count, 128 + 16 /* Test and readjust count. */
> b.ls L(copy64_from_end)
> -
> L(loop64):
> - stp A_l, A_h, [dst, 16]
> - ldp A_l, A_h, [src, 16]
> - stp B_l, B_h, [dst, 32]
> - ldp B_l, B_h, [src, 32]
> - stp C_l, C_h, [dst, 48]
> - ldp C_l, C_h, [src, 48]
> - stp D_l, D_h, [dst, 64]!
> - ldp D_l, D_h, [src, 64]!
> + stp A_q, B_q, [dst, 16]
> + ldp A_q, B_q, [src, 80]
> + stp C_q, D_q, [dst, 48]
> + ldp C_q, D_q, [src, 112]
> + add src, src, 64
> + add dst, dst, 64
> subs count, count, 64
> b.hi L(loop64)
>
> /* Write the last iteration and copy 64 bytes from the end. */
> L(copy64_from_end):
> - ldp E_l, E_h, [srcend, -64]
> - stp A_l, A_h, [dst, 16]
> - ldp A_l, A_h, [srcend, -48]
> - stp B_l, B_h, [dst, 32]
> - ldp B_l, B_h, [srcend, -32]
> - stp C_l, C_h, [dst, 48]
> - ldp C_l, C_h, [srcend, -16]
> - stp D_l, D_h, [dst, 64]
> - stp E_l, E_h, [dstend, -64]
> - stp A_l, A_h, [dstend, -48]
> - stp B_l, B_h, [dstend, -32]
> - stp C_l, C_h, [dstend, -16]
> + ldp E_q, F_q, [srcend, -64]
> + stp A_q, B_q, [dst, 16]
> + ldp A_q, B_q, [srcend, -32]
> + stp C_q, D_q, [dst, 48]
> + stp E_q, F_q, [dstend, -64]
> + stp A_q, B_q, [dstend, -32]
> ret
>
> END (MEMCPY)
> libc_hidden_builtin_def (MEMCPY)
>
> -ENTRY_ALIGN (MEMMOVE, 4)
> +
> +ENTRY (MEMMOVE)
> PTR_ARG (0)
> PTR_ARG (1)
> SIZE_ARG (2)
> @@ -220,64 +198,56 @@ ENTRY_ALIGN (MEMMOVE, 4)
> cmp count, 32
> b.hi L(copy32_128)
>
> - /* Small copies: 0..32 bytes. */
> + /* Small moves: 0..32 bytes. */
> cmp count, 16
> b.lo L(copy16)
> - ldp A_l, A_h, [src]
> - ldp D_l, D_h, [srcend, -16]
> - stp A_l, A_h, [dstin]
> - stp D_l, D_h, [dstend, -16]
> + ldr A_q, [src]
> + ldr B_q, [srcend, -16]
> + str A_q, [dstin]
> + str B_q, [dstend, -16]
> ret
>
> - .p2align 4
> L(move_long):
> /* Only use backward copy if there is an overlap. */
> sub tmp1, dstin, src
> - cbz tmp1, L(copy0)
> + cbz tmp1, L(move0)
> cmp tmp1, count
> b.hs L(copy_long)
>
> /* Large backwards copy for overlapping copies.
> - Copy 16 bytes and then align dst to 16-byte alignment. */
> - ldp D_l, D_h, [srcend, -16]
> - and tmp1, dstend, 15
> - sub srcend, srcend, tmp1
> + Copy 16 bytes and then align srcend to 16-byte alignment. */
> +L(copy_long_backwards):
> + ldr D_q, [srcend, -16]
> + and tmp1, srcend, 15
> + bic srcend, srcend, 15
> sub count, count, tmp1
> - ldp A_l, A_h, [srcend, -16]
> - stp D_l, D_h, [dstend, -16]
> - ldp B_l, B_h, [srcend, -32]
> - ldp C_l, C_h, [srcend, -48]
> - ldp D_l, D_h, [srcend, -64]!
> + ldp A_q, B_q, [srcend, -32]
> + str D_q, [dstend, -16]
> + ldp C_q, D_q, [srcend, -64]
> sub dstend, dstend, tmp1
> subs count, count, 128
> b.ls L(copy64_from_start)
>
> L(loop64_backwards):
> - stp A_l, A_h, [dstend, -16]
> - ldp A_l, A_h, [srcend, -16]
> - stp B_l, B_h, [dstend, -32]
> - ldp B_l, B_h, [srcend, -32]
> - stp C_l, C_h, [dstend, -48]
> - ldp C_l, C_h, [srcend, -48]
> - stp D_l, D_h, [dstend, -64]!
> - ldp D_l, D_h, [srcend, -64]!
> + str B_q, [dstend, -16]
> + str A_q, [dstend, -32]
> + ldp A_q, B_q, [srcend, -96]
> + str D_q, [dstend, -48]
> + str C_q, [dstend, -64]!
> + ldp C_q, D_q, [srcend, -128]
> + sub srcend, srcend, 64
> subs count, count, 64
> b.hi L(loop64_backwards)
>
> /* Write the last iteration and copy 64 bytes from the start. */
> L(copy64_from_start):
> - ldp G_l, G_h, [src, 48]
> - stp A_l, A_h, [dstend, -16]
> - ldp A_l, A_h, [src, 32]
> - stp B_l, B_h, [dstend, -32]
> - ldp B_l, B_h, [src, 16]
> - stp C_l, C_h, [dstend, -48]
> - ldp C_l, C_h, [src]
> - stp D_l, D_h, [dstend, -64]
> - stp G_l, G_h, [dstin, 48]
> - stp A_l, A_h, [dstin, 32]
> - stp B_l, B_h, [dstin, 16]
> - stp C_l, C_h, [dstin]
> + ldp E_q, F_q, [src, 32]
> + stp A_q, B_q, [dstend, -32]
> + ldp A_q, B_q, [src]
> + stp C_q, D_q, [dstend, -64]
> + stp E_q, F_q, [dstin, 32]
> + stp A_q, B_q, [dstin]
> +L(move0):
> ret
>
> END (MEMMOVE)
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index bc5cde8add07b908178fb0271decc27f728f7a2e..7f2d85b0e5acc0a694e91b17fbccc0dba0ea339d 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -3,7 +3,6 @@ sysdep_routines += \
> memchr_generic \
> memchr_nosimd \
> memcpy_a64fx \
> - memcpy_advsimd \
> memcpy_generic \
> memcpy_sve \
> memcpy_thunderx \
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index 9c2542de38fb109b7c6f1db4aacee3a6b544fa3f..e7c4dcc0ed5a68ecd8dacc06256d0749b76912cb 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -36,7 +36,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL (i, name, memcpy,
> IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
> IFUNC_IMPL_ADD (array, i, memcpy, !bti, __memcpy_thunderx2)
> - IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd)
> #if HAVE_AARCH64_SVE_ASM
> IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_a64fx)
> IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_sve)
> @@ -45,7 +44,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL (i, name, memmove,
> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
> IFUNC_IMPL_ADD (array, i, memmove, !bti, __memmove_thunderx2)
> - IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd)
> #if HAVE_AARCH64_SVE_ASM
> IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_a64fx)
> IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_sve)
> diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
> index 5006b0594a476bcc149f2ae022bea50379d04908..1e08ce852e68409fd0eeb975edab77ebe8da8635 100644
> --- a/sysdeps/aarch64/multiarch/memcpy.c
> +++ b/sysdeps/aarch64/multiarch/memcpy.c
> @@ -29,7 +29,6 @@
> extern __typeof (__redirect_memcpy) __libc_memcpy;
>
> extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
> -extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
> extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
> extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
> extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
> @@ -40,9 +39,6 @@ select_memcpy_ifunc (void)
> {
> INIT_ARCH ();
>
> - if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr))
> - return __memcpy_simd;
> -
> if (sve && HAVE_AARCH64_SVE_ASM)
> {
This changes how neoverse-n2 is handled, is that expected?
That is neoverse-n2 was returning __memcpy_simd before and now will be
returning __memcpy_sve as n2 has SVE.
Thanks,
Andrew Pinski
> if (IS_A64FX (midr))
> diff --git a/sysdeps/aarch64/multiarch/memcpy_advsimd.S b/sysdeps/aarch64/multiarch/memcpy_advsimd.S
> deleted file mode 100644
> index fe9beaf5ead47268867bee98acad3b17c554656a..0000000000000000000000000000000000000000
> --- a/sysdeps/aarch64/multiarch/memcpy_advsimd.S
> +++ /dev/null
> @@ -1,248 +0,0 @@
> -/* Generic optimized memcpy using SIMD.
> - Copyright (C) 2020-2022 Free Software Foundation, Inc.
> -
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library. If not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -
> -/* Assumptions:
> - *
> - * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
> - *
> - */
> -
> -#define dstin x0
> -#define src x1
> -#define count x2
> -#define dst x3
> -#define srcend x4
> -#define dstend x5
> -#define A_l x6
> -#define A_lw w6
> -#define A_h x7
> -#define B_l x8
> -#define B_lw w8
> -#define B_h x9
> -#define C_lw w10
> -#define tmp1 x14
> -
> -#define A_q q0
> -#define B_q q1
> -#define C_q q2
> -#define D_q q3
> -#define E_q q4
> -#define F_q q5
> -#define G_q q6
> -#define H_q q7
> -
> -
> -/* This implementation supports both memcpy and memmove and shares most code.
> - It uses unaligned accesses and branchless sequences to keep the code small,
> - simple and improve performance.
> -
> - Copies are split into 3 main cases: small copies of up to 32 bytes, medium
> - copies of up to 128 bytes, and large copies. The overhead of the overlap
> - check in memmove is negligible since it is only required for large copies.
> -
> - Large copies use a software pipelined loop processing 64 bytes per
> - iteration. The destination pointer is 16-byte aligned to minimize
> - unaligned accesses. The loop tail is handled by always copying 64 bytes
> - from the end. */
> -
> -ENTRY (__memcpy_simd)
> - PTR_ARG (0)
> - PTR_ARG (1)
> - SIZE_ARG (2)
> -
> - add srcend, src, count
> - add dstend, dstin, count
> - cmp count, 128
> - b.hi L(copy_long)
> - cmp count, 32
> - b.hi L(copy32_128)
> -
> - /* Small copies: 0..32 bytes. */
> - cmp count, 16
> - b.lo L(copy16)
> - ldr A_q, [src]
> - ldr B_q, [srcend, -16]
> - str A_q, [dstin]
> - str B_q, [dstend, -16]
> - ret
> -
> - /* Copy 8-15 bytes. */
> -L(copy16):
> - tbz count, 3, L(copy8)
> - ldr A_l, [src]
> - ldr A_h, [srcend, -8]
> - str A_l, [dstin]
> - str A_h, [dstend, -8]
> - ret
> -
> - /* Copy 4-7 bytes. */
> -L(copy8):
> - tbz count, 2, L(copy4)
> - ldr A_lw, [src]
> - ldr B_lw, [srcend, -4]
> - str A_lw, [dstin]
> - str B_lw, [dstend, -4]
> - ret
> -
> - /* Copy 0..3 bytes using a branchless sequence. */
> -L(copy4):
> - cbz count, L(copy0)
> - lsr tmp1, count, 1
> - ldrb A_lw, [src]
> - ldrb C_lw, [srcend, -1]
> - ldrb B_lw, [src, tmp1]
> - strb A_lw, [dstin]
> - strb B_lw, [dstin, tmp1]
> - strb C_lw, [dstend, -1]
> -L(copy0):
> - ret
> -
> - .p2align 4
> - /* Medium copies: 33..128 bytes. */
> -L(copy32_128):
> - ldp A_q, B_q, [src]
> - ldp C_q, D_q, [srcend, -32]
> - cmp count, 64
> - b.hi L(copy128)
> - stp A_q, B_q, [dstin]
> - stp C_q, D_q, [dstend, -32]
> - ret
> -
> - .p2align 4
> - /* Copy 65..128 bytes. */
> -L(copy128):
> - ldp E_q, F_q, [src, 32]
> - cmp count, 96
> - b.ls L(copy96)
> - ldp G_q, H_q, [srcend, -64]
> - stp G_q, H_q, [dstend, -64]
> -L(copy96):
> - stp A_q, B_q, [dstin]
> - stp E_q, F_q, [dstin, 32]
> - stp C_q, D_q, [dstend, -32]
> - ret
> -
> - /* Align loop64 below to 16 bytes. */
> - nop
> -
> - /* Copy more than 128 bytes. */
> -L(copy_long):
> - /* Copy 16 bytes and then align src to 16-byte alignment. */
> - ldr D_q, [src]
> - and tmp1, src, 15
> - bic src, src, 15
> - sub dst, dstin, tmp1
> - add count, count, tmp1 /* Count is now 16 too large. */
> - ldp A_q, B_q, [src, 16]
> - str D_q, [dstin]
> - ldp C_q, D_q, [src, 48]
> - subs count, count, 128 + 16 /* Test and readjust count. */
> - b.ls L(copy64_from_end)
> -L(loop64):
> - stp A_q, B_q, [dst, 16]
> - ldp A_q, B_q, [src, 80]
> - stp C_q, D_q, [dst, 48]
> - ldp C_q, D_q, [src, 112]
> - add src, src, 64
> - add dst, dst, 64
> - subs count, count, 64
> - b.hi L(loop64)
> -
> - /* Write the last iteration and copy 64 bytes from the end. */
> -L(copy64_from_end):
> - ldp E_q, F_q, [srcend, -64]
> - stp A_q, B_q, [dst, 16]
> - ldp A_q, B_q, [srcend, -32]
> - stp C_q, D_q, [dst, 48]
> - stp E_q, F_q, [dstend, -64]
> - stp A_q, B_q, [dstend, -32]
> - ret
> -
> -END (__memcpy_simd)
> -libc_hidden_builtin_def (__memcpy_simd)
> -
> -
> -ENTRY (__memmove_simd)
> - PTR_ARG (0)
> - PTR_ARG (1)
> - SIZE_ARG (2)
> -
> - add srcend, src, count
> - add dstend, dstin, count
> - cmp count, 128
> - b.hi L(move_long)
> - cmp count, 32
> - b.hi L(copy32_128)
> -
> - /* Small moves: 0..32 bytes. */
> - cmp count, 16
> - b.lo L(copy16)
> - ldr A_q, [src]
> - ldr B_q, [srcend, -16]
> - str A_q, [dstin]
> - str B_q, [dstend, -16]
> - ret
> -
> -L(move_long):
> - /* Only use backward copy if there is an overlap. */
> - sub tmp1, dstin, src
> - cbz tmp1, L(move0)
> - cmp tmp1, count
> - b.hs L(copy_long)
> -
> - /* Large backwards copy for overlapping copies.
> - Copy 16 bytes and then align srcend to 16-byte alignment. */
> -L(copy_long_backwards):
> - ldr D_q, [srcend, -16]
> - and tmp1, srcend, 15
> - bic srcend, srcend, 15
> - sub count, count, tmp1
> - ldp A_q, B_q, [srcend, -32]
> - str D_q, [dstend, -16]
> - ldp C_q, D_q, [srcend, -64]
> - sub dstend, dstend, tmp1
> - subs count, count, 128
> - b.ls L(copy64_from_start)
> -
> -L(loop64_backwards):
> - str B_q, [dstend, -16]
> - str A_q, [dstend, -32]
> - ldp A_q, B_q, [srcend, -96]
> - str D_q, [dstend, -48]
> - str C_q, [dstend, -64]!
> - ldp C_q, D_q, [srcend, -128]
> - sub srcend, srcend, 64
> - subs count, count, 64
> - b.hi L(loop64_backwards)
> -
> - /* Write the last iteration and copy 64 bytes from the start. */
> -L(copy64_from_start):
> - ldp E_q, F_q, [src, 32]
> - stp A_q, B_q, [dstend, -32]
> - ldp A_q, B_q, [src]
> - stp C_q, D_q, [dstend, -64]
> - stp E_q, F_q, [dstin, 32]
> - stp A_q, B_q, [dstin]
> -L(move0):
> - ret
> -
> -END (__memmove_simd)
> -libc_hidden_builtin_def (__memmove_simd)
> diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
> index 7dae8b7c956f9083d0896cc771cae79f4901581d..dbf1536525e614f72d3d74bb193015b303618357 100644
> --- a/sysdeps/aarch64/multiarch/memmove.c
> +++ b/sysdeps/aarch64/multiarch/memmove.c
> @@ -29,7 +29,6 @@
> extern __typeof (__redirect_memmove) __libc_memmove;
>
> extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
> -extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
> extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
> extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
> extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden;
> @@ -40,9 +39,6 @@ select_memmove_ifunc (void)
> {
> INIT_ARCH ();
>
> - if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr))
> - return __memmove_simd;
> -
> if (sve && HAVE_AARCH64_SVE_ASM)
> {
> if (IS_A64FX (midr))
>
Hi Andrew,
> This changes how neoverse-n2 is handled, is that expected?
> That is neoverse-n2 was returning __memcpy_simd before and now will be
> returning __memcpy_sve as n2 has SVE.
Yes, the SVE memcpy can be used by any CPU that supports SVE. Having this
as a general rule is better than special casing every CPU.
Cheers,
Wilco
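(For context, a sketch of what select_memcpy_ifunc reduces to after the hunk above. The ThunderX checks (IS_THUNDERX / IS_THUNDERX2 / IS_THUNDERX2PA) are not touched by this diff and are assumed unchanged from the existing memcpy.c; memmove's selector follows the same pattern.)

static inline __typeof (__redirect_memcpy) *
select_memcpy_ifunc (void)
{
  INIT_ARCH ();

  /* Any SVE-capable core (including Neoverse N2) now takes an SVE memcpy;
     the IS_NEOVERSE_N1/N2 check that returned __memcpy_simd is gone.  */
  if (sve && HAVE_AARCH64_SVE_ASM)
    {
      if (IS_A64FX (midr))
        return __memcpy_a64fx;
      return __memcpy_sve;
    }

  /* Per-core special cases, assumed unchanged by this patch.  */
  if (IS_THUNDERX (midr))
    return __memcpy_thunderx;
  if (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr))
    return __memcpy_thunderx2;

  /* Everything else falls back to __memcpy_generic, which after this
     patch is the Advanced SIMD memcpy in sysdeps/aarch64/memcpy.S.  */
  return __memcpy_generic;
}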
On 13/10/22 09:28, Wilco Dijkstra via Libc-alpha wrote:
> Hi Andrew,
>
>> This changes how neoverse-n2 is handled, is that expected?
>> That is neoverse-n2 was returning __memcpy_simd before and now will be
>> returning __memcpy_sve as n2 has SVE.
>
> Yes, the SVE memcpy can be used by any CPU that supports SVE. Having this
> as a general rule is better than special casing every CPU.
>
> Cheers,
> Wilco
Maybe move this change to a different patch?
Hi Adhemerval,
On 13/10/22 09:28, Wilco Dijkstra via Libc-alpha wrote:
> Hi Andrew,
>
>> This changes how neoverse-n2 is handled, is that expected?
>> That is neoverse-n2 was returning __memcpy_simd before and now will be
>> returning __memcpy_sve as n2 has SVE.
>
> Yes, the SVE memcpy can be used by any CPU that supports SVE. Having this
> as a general rule is better than special casing every CPU.
> Maybe move this change to a different patch?
That if statement made no sense after the change, so I removed it altogether. Either
way, it doesn't seem large or important enough to warrant a separate patch. I could
add a note in the commit log, e.g.:
Since __memcpy_simd is the fastest memcpy on almost all cores, use it by default.
If SVE is available, a SVE memcpy will be used by default (including Neoverse N2).
Cheers,
Wilco
The 10/19/2022 12:31, Wilco Dijkstra via Libc-alpha wrote:
> Hi Adhemerval,
>
> On 13/10/22 09:28, Wilco Dijkstra via Libc-alpha wrote:
> > Hi Andrew,
> >
> >> This changes how neoverse-n2 is handled, is that expected?
> >> That is neoverse-n2 was returning __memcpy_simd before and now will be
> >> returning __memcpy_sve as n2 has SVE.
> >
> > Yes, the SVE memcpy can be used by any CPU that supports SVE. Having this
> > as a general rule is better than special casing every CPU.
>
> > Maybe move this change to a different patch?
>
> That if statement made no sense after the change, so I removed it altogether. Either
> way, it doesn't seem large or important enough to warrant a separate patch. I could
> add a note in the commit log, e.g.:
>
> Since __memcpy_simd is the fastest memcpy on almost all cores, use it by default.
> If SVE is available, a SVE memcpy will be used by default (including Neoverse N2).
the patch is OK to commit with this note.
thanks.