From patchwork Tue Dec 15 16:40:44 2015
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Wilco Dijkstra
X-Patchwork-Id: 10019
Delivered-To: mailing list libc-alpha@sourceware.org
From: Wilco Dijkstra
To: 'GNU C Library'
CC: nd
Subject: Re: [PATCH][AArch64] Optimized memcpy/memmove
Date: Tue, 15 Dec 2015 16:40:44 +0000

-----Original Message-----
From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
Sent: 25 September 2015 14:17
To: 'GNU C Library'
Subject: [PATCH][AArch64] Optimized memcpy/memmove

Further optimize memcpy/memmove for AArch64.  Copies are split into 3 main
cases: small copies of up to 16 bytes, medium copies of 17..96 bytes, which
are fully unrolled, and large copies of more than 96 bytes, which align the
destination and use an unrolled loop processing 64 bytes per iteration.
In order to share code with memmove, small and medium copies read all data
before writing, allowing any kind of overlap.
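To make the overlap claim concrete, here is a rough C equivalent of the
8..16 byte case (a sketch only, not part of the patch; the function name is
made up and the real code is the L(copy16) sequence in the diff):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Copy 8..16 bytes by loading 8 bytes from the start and 8 bytes from
   the end of the source before storing anything.  The two halves may
   overlap in the middle, so any count in [8,16] and any src/dst overlap
   is handled.  memcpy is used here only as the portable idiom for an
   unaligned 8-byte load/store.  */
void copy_8_to_16 (void *dstin, const void *src, size_t count)
{
  uint64_t a, b;
  memcpy (&a, src, 8);                             /* ldr A_l, [src]        */
  memcpy (&b, (const char *) src + count - 8, 8);  /* ldr A_h, [srcend, -8] */
  memcpy (dstin, &a, 8);                           /* str A_l, [dstin]      */
  memcpy ((char *) dstin + count - 8, &b, 8);      /* str A_h, [dstend, -8] */
}

The 17..96 byte paths apply the same idea with ldp/stp pairs taken from both
ends of the buffer.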
All memmoves except for the large backwards case fall through into memcpy for
optimal performance.  On a random copy test memcpy/memmove are 40% faster on
A57 and 28% faster on A53.  (A short C model of the memmove dispatch is
sketched after the diff below.)

OK for commit?

ChangeLog:
2015-09-25  Wilco Dijkstra  <wdijkstr@arm.com>

	* sysdeps/aarch64/memcpy.S (memcpy): Rewrite of optimized memcpy
	and memmove.
	* sysdeps/aarch64/memmove.S (memmove): Remove memmove code
	(merged into memcpy.S).

---
 sysdeps/aarch64/memcpy.S  | 350 +++++++++++++++++++++++++++-------------------
 sysdeps/aarch64/memmove.S | 311 +---------------------------------------
 2 files changed, 210 insertions(+), 451 deletions(-)

diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index b3d550e..51e7268 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -9,168 +9,236 @@
    The GNU C Library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.
 
    You should have received a copy of the GNU Lesser General Public
    License along with the GNU C Library.  If not, see
    <http://www.gnu.org/licenses/>.  */
 
+#include <sysdep.h>
+
 /* Assumptions:
  *
- * ARMv8-a, AArch64
- * Unaligned accesses
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
  */
 
 #define dstin	x0
 #define src	x1
 #define count	x2
-#define tmp1	x3
-#define tmp1w	w3
-#define tmp2	x4
-#define tmp2w	w4
-#define tmp3	x5
-#define tmp3w	w5
-#define dst	x6
-
-#define A_l	x7
-#define A_h	x8
-#define B_l	x9
-#define B_h	x10
-#define C_l	x11
-#define C_h	x12
-#define D_l	x13
-#define D_h	x14
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define A_l	x6
+#define A_lw	w6
+#define A_h	x7
+#define A_hw	w7
+#define B_l	x8
+#define B_h	x9
+#define C_l	x10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	src
+#define E_h	count
+#define F_l	srcend
+#define F_h	dst
+#define G_l	count
+#define G_h	dst
+#define tmp1	x14
 
-#include <sysdep.h>
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+   medium copies of 17..96 bytes which are fully unrolled. Large copies
+   of more than 96 bytes align the destination and use an unrolled loop
+   processing 64 bytes per iteration.
+   In order to share code with memmove, small and medium copies read all
+   data before writing, allowing any kind of overlap. So small, medium
+   and large backwards memmoves are handled by falling through into memcpy.
+   Overlapping large forward memmoves use a loop that copies backwards.
+*/
 
-ENTRY_ALIGN (memcpy, 6)
-
-	mov	dst, dstin
-	cmp	count, #64
-	b.ge	L(cpy_not_short)
-	cmp	count, #15
-	b.le	L(tail15tiny)
-
-	/* Deal with small copies quickly by dropping straight into the
-	 * exit block.  */
-L(tail63):
-	/* Copy up to 48 bytes of data.  At this point we only need the
-	 * bottom 6 bits of count to be accurate.  */
-	ands	tmp1, count, #0x30
-	b.eq	L(tail15)
-	add	dst, dst, tmp1
-	add	src, src, tmp1
-	cmp	tmp1w, #0x20
-	b.eq	1f
-	b.lt	2f
-	ldp	A_l, A_h, [src, #-48]
-	stp	A_l, A_h, [dst, #-48]
-1:
-	ldp	A_l, A_h, [src, #-32]
-	stp	A_l, A_h, [dst, #-32]
-2:
-	ldp	A_l, A_h, [src, #-16]
-	stp	A_l, A_h, [dst, #-16]
-
-L(tail15):
-	ands	count, count, #15
-	beq	1f
-	add	src, src, count
-	ldp	A_l, A_h, [src, #-16]
-	add	dst, dst, count
-	stp	A_l, A_h, [dst, #-16]
-1:
-	RET
-
-L(tail15tiny):
-	/* Copy up to 15 bytes of data.  Does not assume additional data
-	   being copied.  */
-	tbz	count, #3, 1f
-	ldr	tmp1, [src], #8
-	str	tmp1, [dst], #8
+ENTRY_ALIGN (memmove, 6)
+
+	sub	tmp1, dstin, src
+	cmp	count, 96
+	ccmp	tmp1, count, 2, hi
+	b.lo	L(move_long)
+
+	/* Common case falls through into memcpy.  */
+END (memmove)
+libc_hidden_builtin_def (memmove)
+ENTRY (memcpy)
+
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 96
+	b.hi	L(copy_long)
+	cmp	count, 16
+	b.hs	L(copy_medium)
+
+	/* Small copies: 0..16 bytes.  */
+L(copy16):
+	tbz	count, 3, 1f
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
 1:
-	tbz	count, #2, 1f
-	ldr	tmp1w, [src], #4
-	str	tmp1w, [dst], #4
+	tbz	count, 2, 1f
+	ldr	A_lw, [src]
+	ldr	A_hw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	A_hw, [dstend, -4]
+	ret
+	.p2align 4
 1:
-	tbz	count, #1, 1f
-	ldrh	tmp1w, [src], #2
-	strh	tmp1w, [dst], #2
+	cbz	count, 2f
+	ldrb	A_lw, [src]
+	tbz	count, 1, 1f
+	ldrh	A_hw, [srcend, -2]
+	strh	A_hw, [dstend, -2]
+1:	strb	A_lw, [dstin]
+2:	ret
+
+	.p2align 4
+	/* Medium copies: 17..96 bytes.  */
+L(copy_medium):
+	ldp	A_l, A_h, [src]
+	tbnz	count, 6, L(copy96)
+	ldp	D_l, D_h, [srcend, -16]
+	tbz	count, 5, 1f
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
 1:
-	tbz	count, #0, 1f
-	ldrb	tmp1w, [src]
-	strb	tmp1w, [dst]
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
+	   32 bytes from the end.  */
+L(copy96):
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [src, 32]
+	ldp	D_l, D_h, [src, 48]
+	ldp	E_l, E_h, [srcend, -32]
+	ldp	F_l, F_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin, 32]
+	stp	D_l, D_h, [dstin, 48]
+	stp	E_l, E_h, [dstend, -32]
+	stp	F_l, F_h, [dstend, -16]
+	ret
+
+	/* Align DST to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.  There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.  The loop
+	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+	.p2align 4
+L(copy_long):
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	ldp	D_l, D_h, [src]
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	2f
 1:
-	RET
-
-L(cpy_not_short):
-	/* We don't much care about the alignment of DST, but we want SRC
-	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
-	 * boundaries on both loads and stores.  */
-	neg	tmp2, src
-	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
-	b.eq	2f
-	sub	count, count, tmp2
-	/* Copy more data than needed; it's faster than jumping
-	 * around copying sub-Quadword quantities.  We know that
-	 * it can't overrun.  */
-	ldp	A_l, A_h, [src]
-	add	src, src, tmp2
-	stp	A_l, A_h, [dst]
-	add	dst, dst, tmp2
-	/* There may be less than 63 bytes to go now.  */
-	cmp	count, #63
-	b.le	L(tail63)
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	1b
+
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the end even if
+	   there is just 1 byte left.  */
 2:
-	subs	count, count, #128
-	b.ge	L(cpy_body_large)
-	/* Less than 128 bytes to copy, so handle 64 here and then jump
-	 * to the tail.  */
-	ldp	A_l, A_h, [src]
-	ldp	B_l, B_h, [src, #16]
-	ldp	C_l, C_h, [src, #32]
-	ldp	D_l, D_h, [src, #48]
-	stp	A_l, A_h, [dst]
-	stp	B_l, B_h, [dst, #16]
-	stp	C_l, C_h, [dst, #32]
-	stp	D_l, D_h, [dst, #48]
-	tst	count, #0x3f
-	add	src, src, #64
-	add	dst, dst, #64
-	b.ne	L(tail63)
-	RET
-
-	/* Critical loop.  Start at a new cache line boundary.  Assuming
-	 * 64 bytes per line this ensures the entire loop is in one line.  */
-	.p2align 6
-L(cpy_body_large):
-	/* There are at least 128 bytes to copy.  */
-	ldp	A_l, A_h, [src, #0]
-	sub	dst, dst, #16		/* Pre-bias.  */
-	ldp	B_l, B_h, [src, #16]
-	ldp	C_l, C_h, [src, #32]
-	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
+	ldp	E_l, E_h, [srcend, -64]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [srcend, -48]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [srcend, -16]
+	stp	D_l, D_h, [dst, 64]
+	stp	E_l, E_h, [dstend, -64]
+	stp	A_l, A_h, [dstend, -48]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
+	ret
+
+	.p2align 4
+L(move_long):
+	cbz	tmp1, 3f
+
+	add	srcend, src, count
+	add	dstend, dstin, count
+
+	/* Align dstend to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.  There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.  The loop
+	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+	and	tmp1, dstend, 15
+	ldp	D_l, D_h, [srcend, -16]
+	sub	srcend, srcend, tmp1
+	sub	count, count, tmp1
+	ldp	A_l, A_h, [srcend, -16]
+	stp	D_l, D_h, [dstend, -16]
+	ldp	B_l, B_h, [srcend, -32]
+	ldp	C_l, C_h, [srcend, -48]
+	ldp	D_l, D_h, [srcend, -64]!
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	2f
+
+	nop
 1:
-	stp	A_l, A_h, [dst, #16]
-	ldp	A_l, A_h, [src, #16]
-	stp	B_l, B_h, [dst, #32]
-	ldp	B_l, B_h, [src, #32]
-	stp	C_l, C_h, [dst, #48]
-	ldp	C_l, C_h, [src, #48]
-	stp	D_l, D_h, [dst, #64]!
-	ldp	D_l, D_h, [src, #64]!
-	subs	count, count, #64
-	b.ge	1b
-	stp	A_l, A_h, [dst, #16]
-	stp	B_l, B_h, [dst, #32]
-	stp	C_l, C_h, [dst, #48]
-	stp	D_l, D_h, [dst, #64]
-	add	src, src, #16
-	add	dst, dst, #64 + 16
-	tst	count, #0x3f
-	b.ne	L(tail63)
-	RET
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [srcend, -16]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [srcend, -48]
+	stp	D_l, D_h, [dstend, -64]!
+	ldp	D_l, D_h, [srcend, -64]!
+	subs	count, count, 64
+	b.hi	1b
+
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the start even if
+	   there is just 1 byte left.  */
+2:
+	ldp	G_l, G_h, [src, 48]
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [src, 32]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [src, 16]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [src]
+	stp	D_l, D_h, [dstend, -64]
+	stp	G_l, G_h, [dstin, 48]
+	stp	A_l, A_h, [dstin, 32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin]
+3:	ret
+
 END (memcpy)
 libc_hidden_builtin_def (memcpy)
diff --git a/sysdeps/aarch64/memmove.S b/sysdeps/aarch64/memmove.S
index 8d0b328..e531b14 100644
--- a/sysdeps/aarch64/memmove.S
+++ b/sysdeps/aarch64/memmove.S
@@ -1,312 +1,3 @@
-/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
+/* memmove is part of memcpy.S.  */
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64
- * Unaligned accesses
- */
-
-/* Parameters and result.  */
-#define dstin	x0
-#define src	x1
-#define count	x2
-#define tmp1	x3
-#define tmp1w	w3
-#define tmp2	x4
-#define tmp2w	w4
-#define tmp3	x5
-#define tmp3w	w5
-#define dst	x6
-
-#define A_l	x7
-#define A_h	x8
-#define B_l	x9
-#define B_h	x10
-#define C_l	x11
-#define C_h	x12
-#define D_l	x13
-#define D_h	x14
-
-ENTRY_ALIGN (memmove, 6)
-
-	cmp	dstin, src
-	b.lo	L(downwards)
-	add	tmp1, src, count
-	cmp	dstin, tmp1
-	b.hs	memcpy		/* No overlap.  */
-
-	/* Upwards move with potential overlap.
-	 * Need to move from the tail backwards.  SRC and DST point one
-	 * byte beyond the remaining data to move.  */
-	add	dst, dstin, count
-	add	src, src, count
-	cmp	count, #64
-	b.ge	L(mov_not_short_up)
-
-	/* Deal with small moves quickly by dropping straight into the
-	 * exit block.  */
-L(tail63up):
-	/* Move up to 48 bytes of data.  At this point we only need the
-	 * bottom 6 bits of count to be accurate.  */
-	ands	tmp1, count, #0x30
-	b.eq	L(tail15up)
-	sub	dst, dst, tmp1
-	sub	src, src, tmp1
-	cmp	tmp1w, #0x20
-	b.eq	1f
-	b.lt	2f
-	ldp	A_l, A_h, [src, #32]
-	stp	A_l, A_h, [dst, #32]
-1:
-	ldp	A_l, A_h, [src, #16]
-	stp	A_l, A_h, [dst, #16]
-2:
-	ldp	A_l, A_h, [src]
-	stp	A_l, A_h, [dst]
-L(tail15up):
-	/* Move up to 15 bytes of data.  Does not assume additional data
-	 * being moved.  */
-	tbz	count, #3, 1f
-	ldr	tmp1, [src, #-8]!
-	str	tmp1, [dst, #-8]!
-1:
-	tbz	count, #2, 1f
-	ldr	tmp1w, [src, #-4]!
-	str	tmp1w, [dst, #-4]!
-1:
-	tbz	count, #1, 1f
-	ldrh	tmp1w, [src, #-2]!
-	strh	tmp1w, [dst, #-2]!
-1:
-	tbz	count, #0, 1f
-	ldrb	tmp1w, [src, #-1]
-	strb	tmp1w, [dst, #-1]
-1:
-	RET
-
-L(mov_not_short_up):
-	/* We don't much care about the alignment of DST, but we want SRC
-	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
-	 * boundaries on both loads and stores.  */
-	ands	tmp2, src, #15	/* Bytes to reach alignment.  */
-	b.eq	2f
-	sub	count, count, tmp2
-	/* Move enough data to reach alignment; unlike memcpy, we have to
-	 * be aware of the overlap, which means we can't move data twice.  */
-	tbz	tmp2, #3, 1f
-	ldr	tmp1, [src, #-8]!
-	str	tmp1, [dst, #-8]!
-1:
-	tbz	tmp2, #2, 1f
-	ldr	tmp1w, [src, #-4]!
-	str	tmp1w, [dst, #-4]!
-1:
-	tbz	tmp2, #1, 1f
-	ldrh	tmp1w, [src, #-2]!
-	strh	tmp1w, [dst, #-2]!
-1:
-	tbz	tmp2, #0, 1f
-	ldrb	tmp1w, [src, #-1]!
-	strb	tmp1w, [dst, #-1]!
-1:
-
-	/* There may be less than 63 bytes to go now.  */
-	cmp	count, #63
-	b.le	L(tail63up)
-2:
-	subs	count, count, #128
-	b.ge	L(mov_body_large_up)
-	/* Less than 128 bytes to move, so handle 64 here and then jump
-	 * to the tail.  */
-	ldp	A_l, A_h, [src, #-64]!
-	ldp	B_l, B_h, [src, #16]
-	ldp	C_l, C_h, [src, #32]
-	ldp	D_l, D_h, [src, #48]
-	stp	A_l, A_h, [dst, #-64]!
-	stp	B_l, B_h, [dst, #16]
-	stp	C_l, C_h, [dst, #32]
-	stp	D_l, D_h, [dst, #48]
-	tst	count, #0x3f
-	b.ne	L(tail63up)
-	RET
-
-	/* Critical loop.  Start at a new Icache line boundary.  Assuming
-	 * 64 bytes per line this ensures the entire loop is in one line.  */
-	.p2align 6
-L(mov_body_large_up):
-	/* There are at least 128 bytes to move.  */
-	ldp	A_l, A_h, [src, #-16]
-	ldp	B_l, B_h, [src, #-32]
-	ldp	C_l, C_h, [src, #-48]
-	ldp	D_l, D_h, [src, #-64]!
-1:
-	stp	A_l, A_h, [dst, #-16]
-	ldp	A_l, A_h, [src, #-16]
-	stp	B_l, B_h, [dst, #-32]
-	ldp	B_l, B_h, [src, #-32]
-	stp	C_l, C_h, [dst, #-48]
-	ldp	C_l, C_h, [src, #-48]
-	stp	D_l, D_h, [dst, #-64]!
-	ldp	D_l, D_h, [src, #-64]!
-	subs	count, count, #64
-	b.ge	1b
-	stp	A_l, A_h, [dst, #-16]
-	stp	B_l, B_h, [dst, #-32]
-	stp	C_l, C_h, [dst, #-48]
-	stp	D_l, D_h, [dst, #-64]!
-	tst	count, #0x3f
-	b.ne	L(tail63up)
-	RET
-
-L(downwards):
-	/* For a downwards move we can safely use memcpy provided that
-	 * DST is more than 16 bytes away from SRC.  */
-	sub	tmp1, src, #16
-	cmp	dstin, tmp1
-	b.ls	memcpy		/* May overlap, but not critically.  */
-
-	mov	dst, dstin	/* Preserve DSTIN for return value.  */
-	cmp	count, #64
-	b.ge	L(mov_not_short_down)
-
-	/* Deal with small moves quickly by dropping straight into the
-	 * exit block.  */
-L(tail63down):
-	/* Move up to 48 bytes of data.  At this point we only need the
-	 * bottom 6 bits of count to be accurate.  */
-	ands	tmp1, count, #0x30
-	b.eq	L(tail15down)
-	add	dst, dst, tmp1
-	add	src, src, tmp1
-	cmp	tmp1w, #0x20
-	b.eq	1f
-	b.lt	2f
-	ldp	A_l, A_h, [src, #-48]
-	stp	A_l, A_h, [dst, #-48]
-1:
-	ldp	A_l, A_h, [src, #-32]
-	stp	A_l, A_h, [dst, #-32]
-2:
-	ldp	A_l, A_h, [src, #-16]
-	stp	A_l, A_h, [dst, #-16]
-L(tail15down):
-	/* Move up to 15 bytes of data.  Does not assume additional data
-	   being moved.  */
-	tbz	count, #3, 1f
-	ldr	tmp1, [src], #8
-	str	tmp1, [dst], #8
-1:
-	tbz	count, #2, 1f
-	ldr	tmp1w, [src], #4
-	str	tmp1w, [dst], #4
-1:
-	tbz	count, #1, 1f
-	ldrh	tmp1w, [src], #2
-	strh	tmp1w, [dst], #2
-1:
-	tbz	count, #0, 1f
-	ldrb	tmp1w, [src]
-	strb	tmp1w, [dst]
-1:
-	RET
-
-L(mov_not_short_down):
-	/* We don't much care about the alignment of DST, but we want SRC
-	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
-	 * boundaries on both loads and stores.  */
-	neg	tmp2, src
-	ands	tmp2, tmp2, #15	/* Bytes to reach alignment.  */
-	b.eq	2f
-	sub	count, count, tmp2
-	/* Move enough data to reach alignment; unlike memcpy, we have to
-	 * be aware of the overlap, which means we can't move data twice.  */
-	tbz	tmp2, #3, 1f
-	ldr	tmp1, [src], #8
-	str	tmp1, [dst], #8
-1:
-	tbz	tmp2, #2, 1f
-	ldr	tmp1w, [src], #4
-	str	tmp1w, [dst], #4
-1:
-	tbz	tmp2, #1, 1f
-	ldrh	tmp1w, [src], #2
-	strh	tmp1w, [dst], #2
-1:
-	tbz	tmp2, #0, 1f
-	ldrb	tmp1w, [src], #1
-	strb	tmp1w, [dst], #1
-1:
-
-	/* There may be less than 63 bytes to go now.  */
-	cmp	count, #63
-	b.le	L(tail63down)
-2:
-	subs	count, count, #128
-	b.ge	L(mov_body_large_down)
-	/* Less than 128 bytes to move, so handle 64 here and then jump
-	 * to the tail.  */
-	ldp	A_l, A_h, [src]
-	ldp	B_l, B_h, [src, #16]
-	ldp	C_l, C_h, [src, #32]
-	ldp	D_l, D_h, [src, #48]
-	stp	A_l, A_h, [dst]
-	stp	B_l, B_h, [dst, #16]
-	stp	C_l, C_h, [dst, #32]
-	stp	D_l, D_h, [dst, #48]
-	tst	count, #0x3f
-	add	src, src, #64
-	add	dst, dst, #64
-	b.ne	L(tail63down)
-	RET
-
-	/* Critical loop.  Start at a new cache line boundary.  Assuming
-	 * 64 bytes per line this ensures the entire loop is in one line.  */
-	.p2align 6
-L(mov_body_large_down):
-	/* There are at least 128 bytes to move.  */
-	ldp	A_l, A_h, [src, #0]
-	sub	dst, dst, #16	/* Pre-bias.  */
-	ldp	B_l, B_h, [src, #16]
-	ldp	C_l, C_h, [src, #32]
-	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
-1:
-	stp	A_l, A_h, [dst, #16]
-	ldp	A_l, A_h, [src, #16]
-	stp	B_l, B_h, [dst, #32]
-	ldp	B_l, B_h, [src, #32]
-	stp	C_l, C_h, [dst, #48]
-	ldp	C_l, C_h, [src, #48]
-	stp	D_l, D_h, [dst, #64]!
-	ldp	D_l, D_h, [src, #64]!
-	subs	count, count, #64
-	b.ge	1b
-	stp	A_l, A_h, [dst, #16]
-	stp	B_l, B_h, [dst, #32]
-	stp	C_l, C_h, [dst, #48]
-	stp	D_l, D_h, [dst, #64]
-	add	src, src, #16
-	add	dst, dst, #64 + 16
-	tst	count, #0x3f
-	b.ne	L(tail63down)
-	RET
-END (memmove)
-
-libc_hidden_builtin_def (memmove)
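For reviewers who prefer C to condition flags: the memmove entry sequence in
the patch (sub/cmp/ccmp/b.lo) behaves roughly like the sketch below.  This is
only an illustration; the helper names are invented and the simple byte loops
stand in for the unrolled assembly paths.

#include <stddef.h>
#include <stdint.h>

/* Stand-in for the memcpy body the common case falls through into.
   For at most 96 bytes the real code reads all data before writing any,
   so overlap is harmless; model that with a bounce buffer.  Larger
   copies only reach this path when a plain forward copy is safe.  */
static void memcpy_path (char *d, const char *s, size_t n)
{
  if (n <= 96)
    {
      char tmp[96];
      for (size_t i = 0; i < n; i++)   /* read everything first ...  */
        tmp[i] = s[i];
      for (size_t i = 0; i < n; i++)   /* ... then write it out.  */
        d[i] = tmp[i];
    }
  else
    for (size_t i = 0; i < n; i++)
      d[i] = s[i];
}

/* Stand-in for L(move_long): copy from the end towards the start.  */
static void backward_loop (char *d, const char *s, size_t n)
{
  while (n--)
    d[n] = s[n];
}

void *sketch_memmove (void *dstin, const void *src, size_t count)
{
  /* dstin - src reduced modulo 2^64 is below count exactly when the
     destination starts inside the source region, i.e. when a forward
     copy would overwrite source bytes before they are read.  */
  uint64_t diff = (uint64_t) ((uintptr_t) dstin - (uintptr_t) src);

  if (count > 96 && diff < count)
    backward_loop (dstin, src, count);   /* the large backwards case */
  else
    memcpy_path (dstin, src, count);     /* everything else falls through */
  return dstin;
}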