From patchwork Tue Oct 2 15:42:42 2018
X-Patchwork-Submitter: Anton Youdkevitch
X-Patchwork-Id: 29620
Date: Tue, 2 Oct 2018 18:42:42 +0300
From: Anton Youdkevitch
To: Steve Ellcey
Cc: Siddhesh Poyarekar, "libc-alpha@sourceware.org"
Subject: Re: [PATCH] aarch64: optimized memcpy implementation for thunderx2
Message-ID: <20181002154242.GA27595@bell-sw.com>
In-Reply-To: <1538433762.18948.108.camel@cavium.com>
References: <2063a582-d65f-9e9f-50f5-80e4502edbd8@gotplt.org>
 <1538408223.18948.85.camel@cavium.com>
 <0899c6de-9462-8cca-5283-adc263d4b650@gotplt.org>
 <20181001162219.GA8242@bell-sw.com>
 <1538433762.18948.108.camel@cavium.com>

Steve,

Below is the patch updated per your suggestions, along with a summary
of the performance benchmark results.

On Mon, Oct 01, 2018 at 10:42:44PM +0000, Steve Ellcey wrote:
> On Mon, 2018-10-01 at 19:22 +0300, Anton Youdkevitch wrote:
> > 
> > +L(dst_unaligned):
> > +        /* For the unaligned store case the code loads two
> > +           aligned chunks and then merges them using ext
> > +           instrunction. This can be up to 30% faster than
> > +           the the simple unaligned store access.
> > +
> > +           Current state: tmp1 = dst % 16; C_q, D_q, E_q
> > +           contains data yet to be stored. src and dst points
> > +           to next-to-be-processed data. A_q, B_q contains
> > +           data already stored before, count = bytes left to
> > +           be load decremented by 64.
> > +
> > +           The control is passed here if at least 64 bytes left
> > +           to be loaded. The code does two aligned loads and then
> > +           extracts (16-tmp1) bytes from the first register and
> > +           tmp1 bytes from the next register forming the value
> > +           for the aligned store.
> > +
> > +           As ext instruction can only have it's index encoded
> > +           as immediate. 15 code chunks process each possible
> > +           index value. Computed goto is used to reach the
> > +           required code. */
> > +
> > +        /* Store the 16 bytes to dst and align dst for further
> > +           operations, several bytes will be stored at this
> > +           address once more */
> > +        str        C_q, [dst], #16
> > +        ldp        F_q, G_q, [src], #32
> > +        bic        dst, dst, 15
> > +        adr        tmp2, L(load_and_merge)
> > +        add        tmp2, tmp2, tmp1, LSL 7
> > +        sub        tmp2, tmp2, 128
> > +        br         tmp2
> 
> Anton,
> 
> As far as the actual code, I think my only concern is this use of a
> 'computed goto' to jump to one of the extract sections.  It seems very
> brittle since a change in the alignment of the various sections or a
> change in the size of those sections could mess up this jump.  Would
> the code be any slower if you used a jump table instead of a computed
> goto?
> 
> The rest of my comments are just some minor suggestions to improve the
> comments or fix some typos.
> 
> > +/* Copies are split into 3 main cases: small copies of up to 16 bytes,
> > +   medium copies of 17..96 bytes which are fully unrolled. Large copies
> > +   of more than 96 bytes align the destination and use an unrolled loop
> > +   processing 64 bytes per iteration.
> > +   The current optimized memcpy implementation is not compatible with
> > +   memmove and is separated from it completely. See below.
> > +   Overlapping large forward memmoves use a loop that copies backwards.
> > +*/
> 
> Since this comment is in front of memmove (which is now completely
> separate from memcpy), it should probably just talk about memmove and
> then you can have a separate comment in front of memcpy which may
> duplicate some of this explanation.  Including the line about memcpy
> being incompatible with memmove is still appropriate.  So instead of
> 'Copies are split' have 'Moves are split'. etc.
> > +/* memcpy implementation below is not compatible with memmove
> > +   because of pipelined loads/stores, which are faster, but they
> > +   can't be used in the case of overlapping memmove arrays */
> 
> Expand this by copying some of the text from memmove but still
> include the text about why it is not compatible with memmove.
> 
> > +        /* the range of count being [65..96] becomes [65..111]
> > +           after tmp [0..15] gets added to it,
> > +           count now is +48 */
> 
> Start with uppercase 'T' (The range vs. the range).
> > +L(dst_unaligned):
> > +        /* For the unaligned store case the code loads two
> > +           aligned chunks and then merges them using ext
> > +           instrunction.
This can be up to 30% faster than > > instrunction -> instruction > > Steve Ellcey * sysdeps/aarch64/multiarch/memcpy_thunderx2.S: rewritten implementation considering thunderX2 chip specifics Performance gain against the current T2 implementation: memcpy-large: 65K-32M: +40% - +10% memcpy-walk: 128-32M: +20% - +2% diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S index de494d9..6000365 100644 --- a/sysdeps/aarch64/multiarch/memcpy_thunderx.S +++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S @@ -74,13 +74,10 @@ #if IS_IN (libc) -# ifndef USE_THUNDERX2 # undef MEMCPY # define MEMCPY __memcpy_thunderx # undef MEMMOVE # define MEMMOVE __memmove_thunderx -# define USE_THUNDERX -# endif ENTRY_ALIGN (MEMMOVE, 6) @@ -182,8 +179,6 @@ L(copy96): .p2align 4 L(copy_long): -# if defined(USE_THUNDERX) || defined (USE_THUNDERX2) - /* On thunderx, large memcpy's are helped by software prefetching. This loop is identical to the one below it but with prefetching instructions included. For loops that are less than 32768 bytes, @@ -196,11 +191,7 @@ L(copy_long): bic dst, dstin, 15 ldp D_l, D_h, [src] sub src, src, tmp1 -# if defined(USE_THUNDERX) prfm pldl1strm, [src, 384] -# elif defined(USE_THUNDERX2) - prfm pldl1strm, [src, 256] -# endif add count, count, tmp1 /* Count is now 16 too large. */ ldp A_l, A_h, [src, 16] stp D_l, D_h, [dstin] @@ -210,13 +201,9 @@ L(copy_long): subs count, count, 128 + 16 /* Test and readjust count. */ L(prefetch_loop64): -# if defined(USE_THUNDERX) tbz src, #6, 1f prfm pldl1strm, [src, 512] 1: -# elif defined(USE_THUNDERX2) - prfm pldl1strm, [src, 256] -# endif stp A_l, A_h, [dst, 16] ldp A_l, A_h, [src, 16] stp B_l, B_h, [dst, 32] @@ -230,7 +217,6 @@ L(prefetch_loop64): b L(last64) L(copy_long_without_prefetch): -# endif and tmp1, dstin, 15 bic dst, dstin, 15 diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S index 8501abf..b2369ab 100644 --- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S +++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S @@ -20,8 +20,1035 @@ /* The actual code in this memcpy and memmove is in memcpy_thunderx.S. The only real differences are with the prefetching instructions. */ +#include + +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses. + * + */ + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define tmp2 x6 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define A_hw w7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l src +#define E_h count +#define F_l srcend +#define F_h dst +#define G_l count +#define G_h dst +#define tmp1 x14 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 +#define H_q q7 +#define I_q q16 +#define J_q q17 + +#define A_v v0 +#define B_v v1 +#define C_v v2 +#define D_v v3 +#define E_v v4 +#define F_v v5 +#define G_v v6 +#define H_v v7 +#define I_v v16 +#define J_v v17 + +#ifndef MEMMOVE +# define MEMMOVE memmove +#endif +#ifndef MEMCPY +# define MEMCPY memcpy +#endif + +#if IS_IN (libc) + +#undef MEMCPY +#undef MEMMOVE #define MEMCPY __memcpy_thunderx2 #define MEMMOVE __memmove_thunderx2 -#define USE_THUNDERX2 -#include "memcpy_thunderx.S" + +/* Moves are split into 3 main cases: small copies of up to 16 bytes, + medium copies of 17..96 bytes which are fully unrolled. 
Large copies + of more than 96 bytes align the destination and use an unrolled loop + processing 64 bytes per iteration. + Overlapping large forward memmoves use a loop that copies backwards. +*/ + +ENTRY_ALIGN (MEMMOVE, 6) + + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) + + sub tmp1, dstin, src + cmp count, 96 + ccmp tmp1, count, 2, hi + b.lo L(move_long) + + prfm PLDL1KEEP, [src] + add srcend, src, count + add dstend, dstin, count + cmp count, 16 + b.ls L(copy16) + cmp count, 96 + b.hi L(copy_long) + + /* Medium copies: 17..96 bytes. */ + sub tmp1, count, 1 + ldp A_l, A_h, [src] + tbnz tmp1, 6, L(copy96) + ldp D_l, D_h, [srcend, -16] + tbz tmp1, 5, 1f + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] +1: + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Small copies: 0..16 bytes. */ +L(copy16): + cmp count, 8 + b.lo 1f + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret + .p2align 4 +1: + tbz count, 2, 1f + ldr A_lw, [src] + ldr A_hw, [srcend, -4] + str A_lw, [dstin] + str A_hw, [dstend, -4] + ret + + /* Copy 0..3 bytes. Use a branchless sequence that copies the same + byte 3 times if count==1, or the 2nd byte twice if count==2. */ +1: + cbz count, 2f + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb A_hw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb A_hw, [dstend, -1] +2: ret + + .p2align 4 + /* Copy 64..96 bytes. Copy 64 bytes from the start and + 32 bytes from the end. */ +L(copy96): + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [src, 32] + ldp D_l, D_h, [src, 48] + ldp E_l, E_h, [srcend, -32] + ldp F_l, F_h, [srcend, -16] + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin, 32] + stp D_l, D_h, [dstin, 48] + stp E_l, E_h, [dstend, -32] + stp F_l, F_h, [dstend, -16] + ret + + /* Align DST to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 64 bytes per iteration and prefetches one iteration ahead. */ + + .p2align 4 +L(copy_long): + and tmp1, dstin, 15 + bic dst, dstin, 15 + ldp D_l, D_h, [src] + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(last64) +L(loop64): + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(loop64) + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the end even if + there is just 1 byte left. 
*/ +L(last64): + ldp E_l, E_h, [srcend, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend, -64] + stp A_l, A_h, [dstend, -48] + stp B_l, B_h, [dstend, -32] + stp C_l, C_h, [dstend, -16] + ret + + .p2align 4 +L(move_long): + cbz tmp1, 3f + + add srcend, src, count + add dstend, dstin, count + + /* Align dstend to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 64 bytes per iteration and prefetches one iteration ahead. */ + + and tmp1, dstend, 15 + ldp D_l, D_h, [srcend, -16] + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls 2f + + nop +1: + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! + subs count, count, 64 + b.hi 1b + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the start even if + there is just 1 byte left. */ +2: + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] +3: ret + +END (MEMMOVE) +libc_hidden_builtin_def (MEMMOVE) + + +/* Copies are split into 3 main cases: small copies of up to 16 bytes, + medium copies of 17..96 bytes which are fully unrolled. Large copies + of more than 96 bytes align the destination and use load-and-merge + approach in the case src and dst addresses are unaligned not evenly, + so that, loads and stores are always aligned. + Large copies use an unrolled loop processing 64 bytes per iteration. + The current optimized memcpy implementation is not compatible with + memmove and is separated from it completely. + + memcpy implementation below is not compatible with memmove + because of pipelined loads/stores, which are faster, but they + can't be used in the case of overlapping memmove arrays */ + +#define MEMCPY_PREFETCH_LDR 640 + +ENTRY (MEMCPY) + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) + + add srcend, src, count + cmp count, 16 + b.ls L(memcopy16) + ldr A_q, [src], #16 + add dstend, dstin, count + and tmp1, src, 15 + cmp count, 96 + b.hi L(memcopy_long) + + /* Medium copies: 17..96 bytes. */ + ldr E_q, [srcend, -16] + cmp count, 64 + b.gt L(memcpy_copy96) + cmp count, 48 + b.le L(bytes_17_to_48) + /* 49..64 bytes */ + ldp B_q, C_q, [src] + str E_q, [dstend, -16] + stp A_q, B_q, [dstin] + str C_q, [dstin, 32] + ret + +L(bytes_17_to_48): + /* 17..48 bytes*/ + cmp count, 32 + b.gt L(bytes_32_to_48) + /* 17..32 bytes*/ + str A_q, [dstin] + str E_q, [dstend, -16] + ret + +L(bytes_32_to_48): + /* 32..48 */ + ldr B_q, [src] + str A_q, [dstin] + str E_q, [dstend, -16] + str B_q, [dstin, 16] + ret + + .p2align 4 + /* Small copies: 0..16 bytes. 
*/ +L(memcopy16): + cmp count, 8 + b.lo L(bytes_0_to_8) + ldr A_l, [src] + ldr A_h, [srcend, -8] + add dstend, dstin, count + str A_l, [dstin] + str A_h, [dstend, -8] + ret + .p2align 4 + +L(bytes_0_to_8): + tbz count, 2, L(bytes_0_to_3) + ldr A_lw, [src] + ldr A_hw, [srcend, -4] + add dstend, dstin, count + str A_lw, [dstin] + str A_hw, [dstend, -4] + ret + + /* Copy 0..3 bytes. Use a branchless sequence that copies the same + byte 3 times if count==1, or the 2nd byte twice if count==2. */ +L(bytes_0_to_3): + cbz count, L(end) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb A_hw, [srcend, -1] + add dstend, dstin, count + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb A_hw, [dstend, -1] +L(end): ret + + .p2align 4 + +L(memcpy_copy96): + /* Copying 65..96 bytes. A_q (first 16 bytes) and + E_q(last 16 bytes) are already loaded. + + The size is large enough to benefit from aligned + loads */ + bic src, src, 15 + ldp B_q, C_q, [src] + str A_q, [dstin] + /* Loaded 64 bytes, second 16-bytes chunk can be + overlapping with the first chunk by tmp1 bytes. + Stored 16 bytes. */ + sub dst, dstin, tmp1 + add count, count, tmp1 + /* The range of count being [65..96] becomes [65..111] + after tmp [0..15] gets added to it, + count now is +48 */ + cmp count, 80 + b.gt L(copy96_medium) + ldr D_q, [src, 32] + stp B_q, C_q, [dst, 16] + str E_q, [dstend, -16] + str D_q, [dst, 48] + ret + + .p2align 4 +L(copy96_medium): + ldp D_q, A_q, [src, 32] + str B_q, [dst, 16] + cmp count, 96 + b.gt L(copy96_large) + str E_q, [dstend, -16] + stp C_q, D_q, [dst, 32] + str A_q, [dst, 64] + ret + +L(copy96_large): + ldr F_q, [src, 64] + stp C_q, D_q, [dst, 32] + str E_q, [dstend, -16] + stp A_q, F_q, [dst, 64] + ret + + .p2align 4 +L(memcopy_long): + bic src, src, 15 + ldp B_q, C_q, [src], #32 + str A_q, [dstin] + sub dst, dstin, tmp1 + add count, count, tmp1 + add dst, dst, 16 + and tmp1, dst, 15 + ldp D_q, E_q, [src], #32 + str B_q, [dst], #16 + + /* Already loaded 64+16 bytes. Check if at + least 64 more bytes left */ + subs count, count, 64+64+16 + b.lt L(loop128_exit2) + cmp count, MEMCPY_PREFETCH_LDR + 64 + 32 + b.lt L(loop128) + cbnz tmp1, L(dst_unaligned) + sub count, count, MEMCPY_PREFETCH_LDR + 64 + 32 + + .p2align 4 + +L(loop128_prefetch): + str C_q, [dst], #16 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + str D_q, [dst], #16 + ldp F_q, G_q, [src], #32 + str E_q, [dst], #16 + ldp H_q, A_q, [src], #32 + str F_q, [dst], #16 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + str G_q, [dst], #16 + ldp B_q, C_q, [src], #32 + str H_q, [dst], #16 + ldp D_q, E_q, [src], #32 + stp A_q, B_q, [dst], #32 + subs count, count, 128 + b.ge L(loop128_prefetch) + +L(preloop128): + add count, count, MEMCPY_PREFETCH_LDR + 64 + 32 + .p2align 4 +L(loop128): + ldp F_q, G_q, [src], #32 + str C_q, [dst], #16 + ldp B_q, A_q, [src], #32 + str D_q, [dst], #16 + stp E_q, F_q, [dst], #32 + stp G_q, B_q, [dst], #32 + subs count, count, 64 + b.lt L(loop128_exit1) +L(loop128_proceed): + ldp B_q, C_q, [src], #32 + str A_q, [dst], #16 + ldp D_q, E_q, [src], #32 + str B_q, [dst], #16 + subs count, count, 64 + b.ge L(loop128) + + .p2align 4 +L(loop128_exit2): + stp C_q, D_q, [dst], #32 + str E_q, [dst], #16 + b L(copy_long_check32); + +L(loop128_exit1): + /* A_q is still not stored and 0..63 bytes left, + so, count is -64..-1. 
+ Check if less than 32 bytes left (count < -32) */ + str A_q, [dst], #16 +L(copy_long_check32): + cmn count, 64 + b.eq L(copy_long_done) + cmn count, 32 + b.le L(copy_long_last32) + ldp B_q, C_q, [src] + stp B_q, C_q, [dst] + +L(copy_long_last32): + ldp F_q, G_q, [srcend, -32] + stp F_q, G_q, [dstend, -32] + +L(copy_long_done): + ret + +L(dst_unaligned): + /* For the unaligned store case the code loads two + aligned chunks and then merges them using ext + instruction. This can be up to 30% faster than + the the simple unaligned store access. + + Current state: tmp1 = dst % 16; C_q, D_q, E_q + contains data yet to be stored. src and dst points + to next-to-be-processed data. A_q, B_q contains + data already stored before, count = bytes left to + be load decremented by 64. + + The control is passed here if at least 64 bytes left + to be loaded. The code does two aligned loads and then + extracts (16-tmp1) bytes from the first register and + tmp1 bytes from the next register forming the value + for the aligned store. + + As ext instruction can only have it's index encoded + as immediate. 15 code chunks process each possible + index value. Computed goto is used to reach the + required code. */ + + /* Store the 16 bytes to dst and align dst for further + operations, several bytes will be stored at this + address once more */ + str C_q, [dst], #16 + ldp F_q, G_q, [src], #32 + bic dst, dst, 15 + adr tmp2, L(ext_table) + add tmp2, tmp2, tmp1, LSL #2 + ldr tmp2, [tmp2] + br tmp2 + +.p2align 7 +L(load_and_merge): +#define EXT_SIZE 1 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.ge 2f +1: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +2: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 2b + b 1b +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 2 +L(ext_size_2): + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.ge 2f +1: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +2: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 2b + b 1b +2: + +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 3 +L(ext_size_3): + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.ge 2f +1: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b 
L(copy_long_check32) +2: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 2b + b 1b +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 4 +L(ext_size_4): + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.ge 2f +1: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +2: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 2b + b 1b +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 5 +L(ext_size_5): + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.ge 2f +1: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +2: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 2b + b 1b +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 6 +L(ext_size_6): + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.ge 2f +1: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +2: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 2b + b 1b +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 7 +L(ext_size_7): + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.ge 2f +1: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +2: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, 
G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 2b + b 1b +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 8 +L(ext_size_8): + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.ge 2f +1: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +2: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 2b + b 1b +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 9 +L(ext_size_9): + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.ge 2f +1: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +2: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 2b + b 1b +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 10 +L(ext_size_10): + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.ge 2f +1: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +2: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 2b + b 1b +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 11 +L(ext_size_11): + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.ge 2f +1: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +2: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 2b + b 1b +#undef 
EXT_SIZE + +.p2align 7 +#define EXT_SIZE 12 +L(ext_size_12): + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.ge 2f +1: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +2: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 2b + b 1b +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 13 +L(ext_size_13): + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.ge 2f +1: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +2: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 2b + b 1b +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 14 +L(ext_size_14): + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.ge 2f +1: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +2: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 2b + b 1b +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 15 +L(ext_size_15): + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.ge 2f +1: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +2: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 2b + b 1b +#undef EXT_SIZE + +L(ext_table): + .int 4 + .int L(ext_size_2) - L(load_and_merge) + 4 + .int L(ext_size_3) - L(load_and_merge) + 4 + .int L(ext_size_4) - L(load_and_merge) + 4 + .int L(ext_size_5) - L(load_and_merge) + 4 + .int 
L(ext_size_6) - L(load_and_merge) + 4 + .int L(ext_size_7) - L(load_and_merge) + 4 + .int L(ext_size_8) - L(load_and_merge) + 4 + .int L(ext_size_9) - L(load_and_merge) + 4 + .int L(ext_size_10) - L(load_and_merge) + 4 + .int L(ext_size_11) - L(load_and_merge) + 4 + .int L(ext_size_12) - L(load_and_merge) + 4 + .int L(ext_size_13) - L(load_and_merge) + 4 + .int L(ext_size_14) - L(load_and_merge) + 4 + .int L(ext_size_15) - L(load_and_merge) + 4 + +END (MEMCPY) +libc_hidden_builtin_def (MEMCPY) +#endif
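
P.S. In case a higher-level view of the L(dst_unaligned) scheme is useful
for review, below is a rough C sketch of the load-and-merge idea (not part
of the patch).  It is only an analogue: it uses 64-bit words instead of
16-byte NEON registers, assumes little-endian and an 8-byte aligned dst,
and falls back to a plain memcpy for the tail instead of the count
bookkeeping the assembly does.  The function name is made up for
illustration.

#include <stdint.h>
#include <string.h>

/* Illustrative sketch only: a scalar, 64-bit analogue of the ext-based
   merge in L(dst_unaligned).  Assumes little-endian and an 8-byte
   aligned dst; src may have any alignment.  */
static void
merge_copy_sketch (uint8_t *dst, const uint8_t *src, size_t n)
{
  size_t off = (uintptr_t) src & 7;     /* plays the role of tmp1 */
  if (off == 0 || n < 16)
    {
      memcpy (dst, src, n);             /* nothing to merge */
      return;
    }

  const uint8_t *asrc = src - off;      /* aligned load base */
  uint64_t lo, hi;
  size_t i = 0;

  memcpy (&lo, asrc, 8);                /* first aligned chunk */
  /* Stop early enough that the aligned load of HI never reads past
     src + n; the assembly handles this with its count checks.  */
  for (; i + 8 + (8 - off) <= n; i += 8)
    {
      memcpy (&hi, asrc + i + 8, 8);    /* next aligned chunk */
      /* Scalar counterpart of "ext A_v.16b, C_v.16b, D_v.16b, 16-off":
         take (8 - off) bytes from LO followed by OFF bytes from HI.  */
      uint64_t merged = (lo >> (off * 8)) | (hi << ((8 - off) * 8));
      memcpy (dst + i, &merged, 8);     /* aligned store */
      lo = hi;
    }
  memcpy (dst + i, src + i, n - i);     /* unaligned tail */
}

The shift pair is the scalar counterpart of the ext instruction: it keeps
every load and every store aligned no matter how src and dst are
misaligned relative to each other.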
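
Similarly, here is a sketch of the overlapping head/tail trick used by the
17..32 byte path (A_q from the start, E_q from the end); the helper name
is again illustrative only.

#include <stdint.h>
#include <string.h>

/* Illustrative sketch of the 17..32 byte path: one 16-byte chunk from
   the start and one from the end.  For count < 32 the two stores
   overlap, which is harmless because both write the same source bytes
   into the overlapped part of dst.  */
static void
copy_17_32_sketch (uint8_t *dst, const uint8_t *src, size_t count)
{
  uint8_t head[16], tail[16];
  memcpy (head, src, 16);               /* ldr A_q, [src] */
  memcpy (tail, src + count - 16, 16);  /* ldr E_q, [srcend, -16] */
  memcpy (dst, head, 16);               /* str A_q, [dstin] */
  memcpy (dst + count - 16, tail, 16);  /* str E_q, [dstend, -16] */
}

The larger medium-size paths (33..96 bytes) follow the same pattern with
more registers: load everything needed from the start and the end first,
then store, letting the last store overlap as required.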
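
And the 0..3 byte tail, which the comments describe as storing the same
byte three times for count == 1 and the 2nd byte twice for count == 2,
looks roughly like this in C (name illustrative):

#include <stdint.h>
#include <stddef.h>

/* Illustrative sketch of the branchless 1..3 byte copy.  */
static void
copy_0_3_sketch (uint8_t *dst, const uint8_t *src, size_t count)
{
  if (count == 0)               /* cbz count, L(end) */
    return;
  size_t mid = count >> 1;      /* lsr tmp1, count, 1 */
  uint8_t a = src[0];           /* ldrb A_lw, [src] */
  uint8_t c = src[count - 1];   /* ldrb A_hw, [srcend, -1] */
  uint8_t b = src[mid];         /* ldrb B_lw, [src, tmp1] */
  dst[0] = a;                   /* strb A_lw, [dstin] */
  dst[mid] = b;                 /* strb B_lw, [dstin, tmp1] */
  dst[count - 1] = c;           /* strb A_hw, [dstend, -1] */
}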