From patchwork Thu May 3 17:52:09 2018
X-Patchwork-Submitter: Siddhesh Poyarekar
X-Patchwork-Id: 27086
From: Siddhesh Poyarekar <siddhesh@sourceware.org>
To: libc-alpha@sourceware.org
Subject: [PATCH 2/2] Ignore prefetcher tagging for smaller copies
Date: Thu, 3 May 2018 23:22:09 +0530
Message-Id: <20180503175209.2943-3-siddhesh@sourceware.org>
In-Reply-To: <20180503175209.2943-1-siddhesh@sourceware.org>
References: <20180503175209.2943-1-siddhesh@sourceware.org>

For small and medium sized copies, the effect of hardware prefetching
is not as dominant as that of instruction level parallelism.  Hence it
makes more sense to load data into multiple registers than to try to
route all of the loads to the same prefetch unit.  This is also the
case for the loop exit, where we are unable to latch on to the same
prefetch unit anyway, so it makes more sense to have the data loaded
in parallel.

The performance results are a bit mixed with memcpy-random, with
numbers jumping between -1% and +3%, i.e. the numbers don't seem
repeatable.  memcpy-walk sees a 70% improvement (i.e. > 2x) for 128
bytes, and that improvement shrinks as the impact of the tail copy
decreases relative to the loop.

	* sysdeps/aarch64/multiarch/memcpy_falkor.S (B_l, B_lw, B_h,
	C_l, C_h, D_l, D_h, E_l, E_h, F_l, F_h, G_l, G_h): New macros.
	(__memcpy_falkor): Use multiple registers to copy data in loop
	tail.
---
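The scheduling change is easiest to see in a rough C analogue; this is
a sketch for illustration only, not part of the patch, and the
function names below are invented, with 8-byte memcpy calls standing
in for the ldp/stp pairs.  In the old tail, reusing the single
A_l/A_h pair forces the interleaved load/store order (and, on Falkor,
routes every load to the same prefetcher stream); in the new tail,
distinct temporaries let all loads issue before any store:

    #include <stdint.h>
    #include <string.h>

    /* Old schedule: one temporary pair, so loads and stores must
       alternate and each load reuses the registers the previous
       store just consumed.  */
    static void
    tail32_serial (uint8_t *dstend, const uint8_t *srcend)
    {
      uint64_t a_l, a_h;
      memcpy (&a_l, srcend - 32, 8);
      memcpy (&a_h, srcend - 24, 8);
      memcpy (dstend - 32, &a_l, 8);
      memcpy (dstend - 24, &a_h, 8);
      memcpy (&a_l, srcend - 16, 8);	/* reuses a_l/a_h */
      memcpy (&a_h, srcend - 8, 8);
      memcpy (dstend - 16, &a_l, 8);
      memcpy (dstend - 8, &a_h, 8);
    }

    /* New schedule: distinct temporaries, so all four loads can be
       issued before any store, exposing instruction level
       parallelism in the tail.  */
    static void
    tail32_parallel (uint8_t *dstend, const uint8_t *srcend)
    {
      uint64_t b_l, b_h, c_l, c_h;
      memcpy (&b_l, srcend - 32, 8);
      memcpy (&b_h, srcend - 24, 8);
      memcpy (&c_l, srcend - 16, 8);
      memcpy (&c_h, srcend - 8, 8);
      memcpy (dstend - 32, &b_l, 8);
      memcpy (dstend - 24, &b_h, 8);
      memcpy (dstend - 16, &c_l, 8);
      memcpy (dstend - 8, &c_h, 8);
    }

    int
    main (void)
    {
      uint8_t s[32], d[32];
      for (int i = 0; i < 32; i++)
        s[i] = (uint8_t) i;
      tail32_serial (d + 32, s + 32);
      tail32_parallel (d + 32, s + 32);
      return 0;
    }

The same reordering is applied to L(last64), since the loop tail
cannot latch on to the loop's prefetch stream anyway.
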
 sysdeps/aarch64/multiarch/memcpy_falkor.S | 68 +++++++++++++++++++------------
 1 file changed, 41 insertions(+), 27 deletions(-)

diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S
index 8dd8c1e03a..2fe9937f11 100644
--- a/sysdeps/aarch64/multiarch/memcpy_falkor.S
+++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S
@@ -35,6 +35,20 @@
 #define A_hw	w7
 #define tmp1	x14
 
+#define B_l	x8
+#define B_lw	w8
+#define B_h	x9
+#define C_l	x10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	dst
+#define E_h	tmp1
+#define F_l	src
+#define F_h	count
+#define G_l	srcend
+#define G_h	x15
+
 /* Copies are split into 3 main cases:
 
    1. Small copies of up to 32 bytes
@@ -74,21 +88,21 @@ ENTRY_ALIGN (__memcpy_falkor, 6)
 	/* Medium copies: 33..128 bytes.  */
 	sub	tmp1, count, 1
 	ldp	A_l, A_h, [src, 16]
-	stp	A_l, A_h, [dstin, 16]
+	ldp	B_l, B_h, [srcend, -32]
+	ldp	C_l, C_h, [srcend, -16]
 	tbz	tmp1, 6, 1f
-	ldp	A_l, A_h, [src, 32]
-	stp	A_l, A_h, [dstin, 32]
-	ldp	A_l, A_h, [src, 48]
-	stp	A_l, A_h, [dstin, 48]
-	ldp	A_l, A_h, [srcend, -64]
-	stp	A_l, A_h, [dstend, -64]
-	ldp	A_l, A_h, [srcend, -48]
-	stp	A_l, A_h, [dstend, -48]
+	ldp	D_l, D_h, [src, 32]
+	ldp	E_l, E_h, [src, 48]
+	stp	D_l, D_h, [dstin, 32]
+	stp	E_l, E_h, [dstin, 48]
+	ldp	F_l, F_h, [srcend, -64]
+	ldp	G_l, G_h, [srcend, -48]
+	stp	F_l, F_h, [dstend, -64]
+	stp	G_l, G_h, [dstend, -48]
 1:
-	ldp	A_l, A_h, [srcend, -32]
-	stp	A_l, A_h, [dstend, -32]
-	ldp	A_l, A_h, [srcend, -16]
-	stp	A_l, A_h, [dstend, -16]
+	stp	A_l, A_h, [dstin, 16]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
 	ret
 
 	.p2align 4
@@ -98,36 +112,36 @@ L(copy32):
 	cmp	count, 16
 	b.lo	1f
 	ldp	A_l, A_h, [src]
+	ldp	B_l, B_h, [srcend, -16]
 	stp	A_l, A_h, [dstin]
-	ldp	A_l, A_h, [srcend, -16]
-	stp	A_l, A_h, [dstend, -16]
+	stp	B_l, B_h, [dstend, -16]
 	ret
 	.p2align 4
 1:
 	/* 8-15 */
 	tbz	count, 3, 1f
 	ldr	A_l, [src]
+	ldr	B_l, [srcend, -8]
 	str	A_l, [dstin]
-	ldr	A_l, [srcend, -8]
-	str	A_l, [dstend, -8]
+	str	B_l, [dstend, -8]
 	ret
 	.p2align 4
 1:
 	/* 4-7 */
 	tbz	count, 2, 1f
 	ldr	A_lw, [src]
+	ldr	B_lw, [srcend, -4]
 	str	A_lw, [dstin]
-	ldr	A_lw, [srcend, -4]
-	str	A_lw, [dstend, -4]
+	str	B_lw, [dstend, -4]
 	ret
 	.p2align 4
 1:
 	/* 2-3 */
 	tbz	count, 1, 1f
 	ldrh	A_lw, [src]
+	ldrh	B_lw, [srcend, -2]
 	strh	A_lw, [dstin]
-	ldrh	A_lw, [srcend, -2]
-	strh	A_lw, [dstend, -2]
+	strh	B_lw, [dstend, -2]
 	ret
 	.p2align 4
 1:
@@ -171,12 +185,12 @@ L(loop64):
 L(last64):
 	ldp	A_l, A_h, [srcend, -64]
 	stnp	A_l, A_h, [dstend, -64]
-	ldp	A_l, A_h, [srcend, -48]
-	stnp	A_l, A_h, [dstend, -48]
-	ldp	A_l, A_h, [srcend, -32]
-	stnp	A_l, A_h, [dstend, -32]
-	ldp	A_l, A_h, [srcend, -16]
-	stnp	A_l, A_h, [dstend, -16]
+	ldp	B_l, B_h, [srcend, -48]
+	stnp	B_l, B_h, [dstend, -48]
+	ldp	C_l, C_h, [srcend, -32]
+	stnp	C_l, C_h, [dstend, -32]
+	ldp	D_l, D_h, [srcend, -16]
+	stnp	D_l, D_h, [dstend, -16]
 	ret
 
 END (__memcpy_falkor)
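
The memcpy-walk and memcpy-random figures above come from glibc's
benchtests.  For a rough standalone illustration of the walking style
of measurement (a sketch only, not the actual bench-memcpy-walk
source; the buffer size and timing method here are arbitrary
choices), the idea is that each call copies from a fresh offset, so
successive copies touch new cache lines instead of staying hot in L1:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <time.h>

    #define BUF_SIZE (64 * 1024 * 1024)

    int
    main (void)
    {
      size_t len = 128;	/* copy size under test */
      unsigned char *src = malloc (BUF_SIZE);
      unsigned char *dst = malloc (BUF_SIZE);
      if (src == NULL || dst == NULL)
        return 1;
      memset (src, 1, BUF_SIZE);

      struct timespec t0, t1;
      clock_gettime (CLOCK_MONOTONIC, &t0);
      /* Walk through the buffers one copy at a time.  */
      for (size_t off = 0; off + len <= BUF_SIZE; off += len)
        memcpy (dst + off, src + off, len);
      clock_gettime (CLOCK_MONOTONIC, &t1);

      double ns = (t1.tv_sec - t0.tv_sec) * 1e9
                  + (t1.tv_nsec - t0.tv_nsec);
      printf ("%zu-byte copies: %.2f ns per call\n",
              len, ns / (double) (BUF_SIZE / len));
      free (src);
      free (dst);
      return 0;
    }

Running this once against the old routine and once against the new
one on Falkor hardware is the comparison of interest; absolute
numbers will vary by machine.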