AArch64: Add optimized Q-register memcpy

Message ID: DB8PR08MB503633A48CF226FB3C5FF4F683610@DB8PR08MB5036.eurprd08.prod.outlook.com
State: Committed
Series: AArch64: Add optimized Q-register memcpy

Commit Message

Wilco Dijkstra July 14, 2020, 4:33 p.m. UTC
  Add a new memcpy using 128-bit Q registers - this is faster on modern
cores and reduces codesize.  Similar to the generic memcpy, small cases
include copies up to 32 bytes.  64-128 byte copies are split into two
cases to improve performance of 64-96 byte copies.  Large copies align
the source rather than the destination.

bench-memcpy-random is ~9% faster than memcpy_falkor on Neoverse N1,
so make this memcpy the default on N1 (on Centriq it is 15% faster than
memcpy_falkor).

Passes GLIBC regression tests. OK for commit?

---
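
To make the description above concrete, here is a rough C sketch of the two ideas the new routine is built on: small and medium sizes copy fixed-size blocks from both ends of the buffer and let the blocks overlap, and the large-copy loop leaves its tail to an unconditional copy of the last 64 bytes.  This is an editorial illustration only, not code from the patch; the helper names are invented and the source-alignment handling of the real loop is omitted.

#include <stddef.h>
#include <string.h>

/* With a constant length, memcpy compiles to a single 16-byte load and
   store (one Q register on AArch64).  The temporary makes the
   load-everything-then-store order of the assembly explicit.  */
static void
copy16 (unsigned char *d, const unsigned char *s)
{
  unsigned char t[16];
  memcpy (t, s, 16);
  memcpy (d, t, 16);
}

/* 16..32 bytes: copy the first 16 and the last 16 bytes.  The two stores
   may overlap, so no trailing byte loop is needed.  The 33..128 byte
   cases extend the same idea with 32-byte blocks.  */
static void
copy_16_to_32 (unsigned char *dst, const unsigned char *src, size_t n)
{
  copy16 (dst, src);
  copy16 (dst + n - 16, src + n - 16);
}

/* More than 128 bytes: a 64-bytes-per-iteration loop whose tail is
   handled by copying the last 64 bytes from the end, which may overlap
   bytes the loop already wrote.  */
static void
copy_large (unsigned char *dst, const unsigned char *src, size_t n)
{
  size_t i;
  for (i = 0; i + 64 < n; i += 64)
    memcpy (dst + i, src + i, 64);
  memcpy (dst + n - 64, src + n - 64, 64);
}

The assembly achieves the same effect with ldp/stp of Q-register pairs, so each 32-byte block is a single load and a single store.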
  

Comments

Carlos O'Donell July 14, 2020, 8:17 p.m. UTC | #1
On 7/14/20 12:33 PM, Wilco Dijkstra wrote:
> Add a new memcpy using 128-bit Q registers - this is faster on modern
> cores and reduces codesize.  Similar to the generic memcpy, small cases
> include copies up to 32 bytes.  64-128 byte copies are split into two
> cases to improve performance of 64-96 byte copies.  Large copies align
> the source rather than the destination.
> 
> bench-memcpy-random is ~9% faster than memcpy_falkor on Neoverse N1,
> so make this memcpy the default on N1 (on Centriq it is 15% faster than
> memcpy_falkor).
> 
> Passes GLIBC regression tests. OK for commit?

As release manager this is OK for 2.32 if Szabolcs says it's OK.
 
Szabolcs Nagy July 15, 2020, 8:16 a.m. UTC | #2
The 07/14/2020 16:17, Carlos O'Donell wrote:
> On 7/14/20 12:33 PM, Wilco Dijkstra wrote:
> > Add a new memcpy using 128-bit Q registers - this is faster on modern
> > cores and reduces codesize.  Similar to the generic memcpy, small cases
> > include copies up to 32 bytes.  64-128 byte copies are split into two
> > cases to improve performance of 64-96 byte copies.  Large copies align
> > the source rather than the destination.
> > 
> > bench-memcpy-random is ~9% faster than memcpy_falkor on Neoverse N1,
> > so make this memcpy the default on N1 (on Centriq it is 15% faster than
> > memcpy_falkor).
> > 
> > Passes GLIBC regression tests. OK for commit?
> 
> As release manager this is OK for 2.32 if Szabolcs says it's OK.

thanks.
this is ok to commit.


Patch

diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 4377df0735287c210efd661188f9e6e3923c8003..e93c21e764a8d02b9f07f5030c31836a3f03f3e1 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,5 +1,5 @@ 
 ifeq ($(subdir),string)
-sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
+sysdep_routines += memcpy_generic memcpy_advsimd memcpy_thunderx memcpy_thunderx2 \
 		   memcpy_falkor \
 		   memcpy_new \
 		   memset_generic memset_falkor memset_emag memset_kunpeng \
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 0ccaf53e555e410569eb2be76ec7d5b4d7bc64a5..09feea97ea37ab923cf4a8557197d46adcd49204 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -42,11 +42,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
   IFUNC_IMPL (i, name, memmove,
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx2)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
   IFUNC_IMPL (i, name, memset,
 	      /* Enable this on non-falkor processors too so that other cores
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 2fafefd5d23fc1528031b5fe52098218ed603b89..e6f3ae116701097d71a02e2a1f6bfdadc1eec34a 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -29,6 +29,7 @@ 
 extern __typeof (__redirect_memcpy) __libc_memcpy;
 
 extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
@@ -36,11 +37,11 @@ extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
 libc_ifunc (__libc_memcpy,
             (IS_THUNDERX (midr)
 	     ? __memcpy_thunderx
-	     : (IS_FALKOR (midr) || IS_PHECDA (midr) || IS_ARES (midr) || IS_KUNPENG920 (midr)
+	     : (IS_FALKOR (midr) || IS_PHECDA (midr) || IS_KUNPENG920 (midr)
 		? __memcpy_falkor
 		: (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
 		  ? __memcpy_thunderx2
-		  : __memcpy_generic))));
+		  : (IS_ARES (midr) ? __memcpy_simd : __memcpy_generic)))));
 
 # undef memcpy
 strong_alias (__libc_memcpy, memcpy);
diff --git a/sysdeps/aarch64/multiarch/memcpy_advsimd.S b/sysdeps/aarch64/multiarch/memcpy_advsimd.S
new file mode 100644
index 0000000000000000000000000000000000000000..d4ba74777744c8bb5a83e43ab2d63ad8dab35203
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_advsimd.S
@@ -0,0 +1,247 @@ 
+/* Generic optimized memcpy using SIMD.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define A_l	x6
+#define A_lw	w6
+#define A_h	x7
+#define B_l	x8
+#define B_lw	w8
+#define B_h	x9
+#define C_lw	w10
+#define tmp1	x14
+
+#define A_q	q0
+#define B_q	q1
+#define C_q	q2
+#define D_q	q3
+#define E_q	q4
+#define F_q	q5
+#define G_q	q6
+#define H_q	q7
+
+
+/* This implementation supports both memcpy and memmove and shares most code.
+   It uses unaligned accesses and branchless sequences to keep the code small,
+   simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check in memmove is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per
+   iteration.  The destination pointer is 16-byte aligned to minimize
+   unaligned accesses.  The loop tail is handled by always copying 64 bytes
+   from the end.  */
+
+ENTRY (__memcpy_simd)
+	DELOUSE (0)
+	DELOUSE (1)
+	DELOUSE (2)
+
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 128
+	b.hi	L(copy_long)
+	cmp	count, 32
+	b.hi	L(copy32_128)
+
+	/* Small copies: 0..32 bytes.  */
+	cmp	count, 16
+	b.lo	L(copy16)
+	ldr	A_q, [src]
+	ldr	B_q, [srcend, -16]
+	str	A_q, [dstin]
+	str	B_q, [dstend, -16]
+	ret
+
+	/* Copy 8-15 bytes.  */
+L(copy16):
+	tbz	count, 3, L(copy8)
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+
+	/* Copy 4-7 bytes.  */
+L(copy8):
+	tbz	count, 2, L(copy4)
+	ldr	A_lw, [src]
+	ldr	B_lw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	B_lw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+	cbz	count, L(copy0)
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	C_lw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	C_lw, [dstend, -1]
+L(copy0):
+	ret
+
+	.p2align 4
+	/* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+	ldp	A_q, B_q, [src]
+	ldp	C_q, D_q, [srcend, -32]
+	cmp	count, 64
+	b.hi	L(copy128)
+	stp	A_q, B_q, [dstin]
+	stp	C_q, D_q, [dstend, -32]
+	ret
+
+	.p2align 4
+	/* Copy 65..128 bytes.  */
+L(copy128):
+	ldp	E_q, F_q, [src, 32]
+	cmp	count, 96
+	b.ls	L(copy96)
+	ldp	G_q, H_q, [srcend, -64]
+	stp	G_q, H_q, [dstend, -64]
+L(copy96):
+	stp	A_q, B_q, [dstin]
+	stp	E_q, F_q, [dstin, 32]
+	stp	C_q, D_q, [dstend, -32]
+	ret
+
+	/* Align loop64 below to 16 bytes.  */
+	nop
+
+	/* Copy more than 128 bytes.  */
+L(copy_long):
+	/* Copy 16 bytes and then align src to 16-byte alignment.  */
+	ldr	D_q, [src]
+	and	tmp1, src, 15
+	bic	src, src, 15
+	sub	dst, dstin, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_q, B_q, [src, 16]
+	str	D_q, [dstin]
+	ldp	C_q, D_q, [src, 48]
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	L(copy64_from_end)
+L(loop64):
+	stp	A_q, B_q, [dst, 16]
+	ldp	A_q, B_q, [src, 80]
+	stp	C_q, D_q, [dst, 48]
+	ldp	C_q, D_q, [src, 112]
+	add	src, src, 64
+	add	dst, dst, 64
+	subs	count, count, 64
+	b.hi	L(loop64)
+
+	/* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
+	ldp	E_q, F_q, [srcend, -64]
+	stp	A_q, B_q, [dst, 16]
+	ldp	A_q, B_q, [srcend, -32]
+	stp	C_q, D_q, [dst, 48]
+	stp	E_q, F_q, [dstend, -64]
+	stp	A_q, B_q, [dstend, -32]
+	ret
+
+END (__memcpy_simd)
+libc_hidden_builtin_def (__memcpy_simd)
+
+
+ENTRY (__memmove_simd)
+	DELOUSE (0)
+	DELOUSE (1)
+	DELOUSE (2)
+
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 128
+	b.hi	L(move_long)
+	cmp	count, 32
+	b.hi	L(copy32_128)
+
+	/* Small moves: 0..32 bytes.  */
+	cmp	count, 16
+	b.lo	L(copy16)
+	ldr	A_q, [src]
+	ldr	B_q, [srcend, -16]
+	str	A_q, [dstin]
+	str	B_q, [dstend, -16]
+	ret
+
+L(move_long):
+	/* Only use backward copy if there is an overlap.  */
+	sub	tmp1, dstin, src
+	cbz	tmp1, L(move0)
+	cmp	tmp1, count
+	b.hs	L(copy_long)
+
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
+L(copy_long_backwards):
+	ldr	D_q, [srcend, -16]
+	and	tmp1, srcend, 15
+	bic	srcend, srcend, 15
+	sub	count, count, tmp1
+	ldp	A_q, B_q, [srcend, -32]
+	str	D_q, [dstend, -16]
+	ldp	C_q, D_q, [srcend, -64]
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	L(copy64_from_start)
+
+L(loop64_backwards):
+	stp	A_q, B_q, [dstend, -32]
+	ldp	A_q, B_q, [srcend, -96]
+	stp	C_q, D_q, [dstend, -64]
+	ldp	C_q, D_q, [srcend, -128]
+	sub	srcend, srcend, 64
+	sub	dstend, dstend, 64
+	subs	count, count, 64
+	b.hi	L(loop64_backwards)
+
+	/* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
+	ldp	E_q, F_q, [src, 32]
+	stp	A_q, B_q, [dstend, -32]
+	ldp	A_q, B_q, [src]
+	stp	C_q, D_q, [dstend, -64]
+	stp	E_q, F_q, [dstin, 32]
+	stp	A_q, B_q, [dstin]
+L(move0):
+	ret
+
+END (__memmove_simd)
+libc_hidden_builtin_def (__memmove_simd)
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index ed5a47f6f83e7b0afcec60cb9fa0f09999eaacae..1229f8b89296eddd2e711490bb7fc0b35726b6f5 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -29,6 +29,7 @@ 
 extern __typeof (__redirect_memmove) __libc_memmove;
 
 extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
@@ -40,7 +41,7 @@ libc_ifunc (__libc_memmove,
 		? __memmove_falkor
 		: (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
 		  ? __memmove_thunderx2
-		  : __memmove_generic))));
+		  : (IS_ARES (midr) ? __memmove_simd : __memmove_generic)))));
 
 # undef memmove
 strong_alias (__libc_memmove, memmove);
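
One detail of the patch worth spelling out is the overlap check in __memmove_simd: the sequence "sub tmp1, dstin, src; cmp tmp1, count; b.hs L(copy_long)" treats the pointer difference as unsigned, so the shared forward-copy path is taken unless the destination starts strictly inside the source region.  A rough C equivalent (illustration only; the function name is invented):

#include <stddef.h>
#include <stdint.h>

/* A forward copy is unsafe only when dst points into (src, src + count),
   i.e. when the unsigned difference dst - src is non-zero and below
   count.  dst == src needs no copy at all, matching L(move0); dst below
   src wraps to a large unsigned value, so the forward path is chosen,
   which is also safe.  */
static int
forward_copy_is_safe (const void *dst, const void *src, size_t count)
{
  uintptr_t diff = (uintptr_t) dst - (uintptr_t) src;
  return diff == 0 || diff >= (uintptr_t) count;
}

Because only genuinely overlapping large moves fall through to the backward loop, the check costs almost nothing in the common case, which is what the "overhead of the overlap check in memmove is negligible" comment in memcpy_advsimd.S refers to.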