From patchwork Thu May 3 17:52:08 2018
From: Siddhesh Poyarekar
To: libc-alpha@sourceware.org
Subject: [PATCH 1/2] aarch64, falkor: Ignore prefetcher hints for memmove tail
Date: Thu, 3 May 2018 23:22:08 +0530
Message-Id: <20180503175209.2943-2-siddhesh@sourceware.org>
In-Reply-To: <20180503175209.2943-1-siddhesh@sourceware.org>
References: <20180503175209.2943-1-siddhesh@sourceware.org>

The tails of the copy loops are unable to train the falkor hardware
prefetcher because they load from a different base register than the
hot loop.  In this case, avoid serializing the instructions by loading
the data into different registers.  Also peel the last iteration of the
loop into the tail (and have it use different registers), since this
gives better performance for medium sizes.

This results in performance improvements of between 3% and 20% over the
current falkor implementation for sizes between 128 bytes and 1K on the
memmove-walk benchmark, thus mostly covering the regressions seen
against the generic memmove.

	* sysdeps/aarch64/multiarch/memmove_falkor.S (__memmove_falkor):
	Use multiple registers to move data in loop tail.
---
 sysdeps/aarch64/multiarch/memmove_falkor.S | 48 ++++++++++++++++++------------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/sysdeps/aarch64/multiarch/memmove_falkor.S b/sysdeps/aarch64/multiarch/memmove_falkor.S
index 3375adf2de..c0d9560301 100644
--- a/sysdeps/aarch64/multiarch/memmove_falkor.S
+++ b/sysdeps/aarch64/multiarch/memmove_falkor.S
@@ -150,7 +150,6 @@ L(copy96):
 
 	.p2align 4
 L(copy_long):
-	sub	count, count, 64 + 16	/* Test and readjust count.  */
 	mov	B_l, Q_l
 	mov	B_h, Q_h
 	ldp	A_l, A_h, [src]
@@ -161,6 +160,8 @@ L(copy_long):
 	ldp	Q_l, Q_h, [src, 16]!
 	stp	A_l, A_h, [dstin]
 	ldp	A_l, A_h, [src, 16]!
+	subs	count, count, 32 + 64 + 16	/* Test and readjust count.  */
+	b.ls	L(last64)
 
 L(loop64):
 	subs	count, count, 32
@@ -170,18 +171,22 @@ L(loop64):
 	ldp	A_l, A_h, [src, 16]!
 	b.hi	L(loop64)
 
-	/* Write the last full set of 32 bytes.  The remainder is at most 32
-	   bytes, so it is safe to always copy 32 bytes from the end even if
-	   there is just 1 byte left.  */
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes and at least 33 bytes, so it is safe to always copy 64 bytes
+	   from the end.  */
 L(last64):
-	ldp	C_l, C_h, [srcend, -32]
+	ldp	C_l, C_h, [srcend, -64]
 	stp	Q_l, Q_h, [dst, 16]
-	ldp	Q_l, Q_h, [srcend, -16]
+	mov	Q_l, B_l
+	mov	Q_h, B_h
+	ldp	B_l, B_h, [srcend, -48]
 	stp	A_l, A_h, [dst, 32]
-	stp	C_l, C_h, [dstend, -32]
-	stp	Q_l, Q_h, [dstend, -16]
-	mov	Q_l, B_l
-	mov	Q_h, B_h
+	ldp	A_l, A_h, [srcend, -32]
+	ldp	D_l, D_h, [srcend, -16]
+	stp	C_l, C_h, [dstend, -64]
+	stp	B_l, B_h, [dstend, -48]
+	stp	A_l, A_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
 	ret
 
 	.p2align 4
@@ -204,7 +209,8 @@ L(move_long):
 	sub	count, count, tmp1
 	ldp	A_l, A_h, [srcend, -16]!
 	sub	dstend, dstend, tmp1
-	sub	count, count, 64
+	subs	count, count, 32 + 64
+	b.ls	2f
 
 1:
 	subs	count, count, 32
@@ -214,18 +220,22 @@ L(move_long):
 	ldp	A_l, A_h, [srcend, -16]!
 	b.hi	1b
 
-	/* Write the last full set of 32 bytes.  The remainder is at most 32
-	   bytes, so it is safe to always copy 32 bytes from the start even if
-	   there is just 1 byte left.  */
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes and at least 33 bytes, so it is safe to always copy 64 bytes
+	   from the start.  */
 2:
-	ldp	C_l, C_h, [src, 16]
+	ldp	C_l, C_h, [src, 48]
 	stp	Q_l, Q_h, [dstend, -16]
-	ldp	Q_l, Q_h, [src]
-	stp	A_l, A_h, [dstend, -32]
-	stp	C_l, C_h, [dstin, 16]
-	stp	Q_l, Q_h, [dstin]
 	mov	Q_l, B_l
 	mov	Q_h, B_h
+	ldp	B_l, B_h, [src, 32]
+	stp	A_l, A_h, [dstend, -32]
+	ldp	A_l, A_h, [src, 16]
+	ldp	D_l, D_h, [src]
+	stp	C_l, C_h, [dstin, 48]
+	stp	B_l, B_h, [dstin, 32]
+	stp	A_l, A_h, [dstin, 16]
+	stp	D_l, D_h, [dstin]
 3:	ret
 
 END (__memmove_falkor)
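
For reference, a rough C model of what the rewritten forward-copy tail
(L(last64)) does; the helper names copy_tail64/load16/store16 and the
use of memcpy for the 16-byte accesses are illustrative assumptions and
not part of the patch:

#include <stdint.h>
#include <string.h>

/* One 16-byte chunk, standing in for an ldp/stp register pair.  */
typedef struct { uint64_t lo, hi; } pair16;

static inline pair16
load16 (const unsigned char *p)
{
  pair16 v;
  memcpy (&v, p, 16);		/* models ldp */
  return v;
}

static inline void
store16 (unsigned char *p, pair16 v)
{
  memcpy (p, &v, 16);		/* models stp */
}

/* The main loop has left between 33 and 64 bytes uncopied, so copy a
   full 64 bytes addressed from the end of both buffers.  Four separate
   temporaries keep the loads independent of one another and of the
   registers still live in the loop, and the stores may harmlessly
   overlap bytes the loop has already written.  */
static void
copy_tail64 (unsigned char *dstend, const unsigned char *srcend)
{
  pair16 c = load16 (srcend - 64);
  pair16 b = load16 (srcend - 48);
  pair16 a = load16 (srcend - 32);
  pair16 d = load16 (srcend - 16);

  store16 (dstend - 64, c);
  store16 (dstend - 48, b);
  store16 (dstend - 32, a);
  store16 (dstend - 16, d);
}

The backward-copy tail (label 2 in L(move_long)) is the mirror image:
it copies 64 bytes addressed from the start of the buffers into
dstin..dstin+64 in the same register-independent fashion.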