From patchwork Thu Jun 22 18:58:49 2017
X-Patchwork-Submitter: Siddhesh Poyarekar
X-Patchwork-Id: 21208
From: Siddhesh Poyarekar
To: libc-alpha@sourceware.org
Subject: [PATCH] aarch64: Optimized memcpy for Qualcomm Falkor processor
Date: Fri, 23 Jun 2017 00:28:49 +0530
Message-Id: <1498157929-23554-1-git-send-email-siddhesh@sourceware.org>

This is an optimized memcpy implementation for the Qualcomm Falkor
processor.  It improves SPECint in SPEC CPU2006 by 0.6%, with omnetpp and
xalancbmk leading at 6%, and the overall impact is mostly positive across
benchmarks.  In the glibc microbenchmarks the large copy sizes regress
slightly, but bench-memcpy-random improves throughout by about 5%.

	* sysdeps/aarch64/multiarch/Makefile (sysdep-routines): Add
	memcpy_falkor.
	* sysdeps/aarch64/multiarch/ifunc-impl-list.c (MAX_IFUNC): Bump.
	(__libc_ifunc_impl_list): Add __memcpy_falkor.
	* sysdeps/aarch64/multiarch/memcpy.c (libc_ifunc): Likewise.
	* sysdeps/unix/sysv/linux/aarch64/cpu-features.h (IS_FALKOR): New
	macro.
	* sysdeps/aarch64/multiarch/memcpy_falkor.S: New file.
---
 sysdeps/aarch64/multiarch/Makefile             |   2 +-
 sysdeps/aarch64/multiarch/ifunc-impl-list.c    |   3 +-
 sysdeps/aarch64/multiarch/memcpy.c             |   7 +-
 sysdeps/aarch64/multiarch/memcpy_falkor.S      | 294 +++++++++++++++++++++++++
 sysdeps/unix/sysv/linux/aarch64/cpu-features.h |   3 +
 5 files changed, 306 insertions(+), 3 deletions(-)
 create mode 100644 sysdeps/aarch64/multiarch/memcpy_falkor.S

diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 78d52c7..164ba1a 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,3 +1,3 @@
 ifeq ($(subdir),string)
-sysdep_routines += memcpy_generic memcpy_thunderx
+sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 32056bc..8e873b3 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -25,7 +25,7 @@
 #include <stdio.h>
 
 /* Maximum number of IFUNC implementations.  */
-#define MAX_IFUNC	2
+#define MAX_IFUNC	3
 
 size_t
 __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
@@ -40,6 +40,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c.  */
   IFUNC_IMPL (i, name, memcpy,
	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
   IFUNC_IMPL (i, name, memmove,
	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 9f73efb..b395df1 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -30,9 +30,14 @@ extern __typeof (__redirect_memcpy) __libc_memcpy;
 
 extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
 
 libc_ifunc (__libc_memcpy,
-            IS_THUNDERX (midr) ? __memcpy_thunderx : __memcpy_generic);
+            (IS_THUNDERX (midr)
+             ? __memcpy_thunderx
+             : (IS_FALKOR (midr)
+                ? __memcpy_falkor
+                : __memcpy_generic)));
 
 # undef memcpy
 strong_alias (__libc_memcpy, memcpy);
diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S
new file mode 100644
index 0000000..414adb4
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S
@@ -0,0 +1,294 @@
+/* Optimized memcpy for the Qualcomm Falkor processor.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define A_l	x6
+#define A_lw	w6
+#define A_h	x7
+#define A_hw	w7
+#define B_l	x8
+#define B_lw	w8
+#define B_h	x9
+#define C_l	x10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	src
+#define E_h	count
+#define F_l	srcend
+#define F_h	dst
+#define tmp1	x14
+#define res	x15
+
+#include <sysdep.h>
+
+/* Copy 64 bytes at a time and branch to LABEL on COND.  */
+.macro copy_line_and_branch cond, label
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.\cond	\label
+.endm
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+   medium copies of 17..96 bytes which are fully unrolled.  Large copies
+   of more than 96 bytes align the destination and use an unrolled loop
+   processing 64 bytes per iteration.
+   Small and medium copies read all data before writing, allowing any
+   kind of overlap, and memmove tailcalls memcpy for these cases as
+   well as non-overlapping copies.  */
+
+ENTRY_ALIGN (__memcpy_falkor, 6)
+
+	prfm	PLDL1KEEP, [src]
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 16
+	b.ls	L(copy16)
+	cmp	count, 96
+	b.hi	L(copy_long)
+
+	/* Medium copies: 17..96 bytes.  */
+	sub	tmp1, count, 1
+	ldp	A_l, A_h, [src]
+	tbnz	tmp1, 6, L(copy96)
+	ldp	D_l, D_h, [srcend, -16]
+	tbz	tmp1, 5, 1f
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+1:
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Small copies: 0..16 bytes.  */
+L(copy16):
+	cmp	count, 8
+	b.lo	1f
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+	.p2align 4
+1:
+	tbz	count, 2, 1f
+	ldr	A_lw, [src]
+	ldr	A_hw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	A_hw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
+1:
+	cbz	count, 2f
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	A_hw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	A_hw, [dstend, -1]
+2:	ret
+
+	.p2align 4
+	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
+	   32 bytes from the end.  */
+L(copy96):
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [src, 32]
+	ldp	D_l, D_h, [src, 48]
+	ldp	E_l, E_h, [srcend, -32]
+	ldp	F_l, F_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin, 32]
+	stp	D_l, D_h, [dstin, 48]
+	stp	E_l, E_h, [dstend, -32]
+	stp	F_l, F_h, [dstend, -16]
+	ret
+
+	/* Align DST to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.  There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.  The loop
+	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+	.p2align 4
+L(copy_long):
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	ldp	D_l, D_h, [src]
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	5f
+
+	/* Unroll the copy for 512 bytes.  We do this so that smaller copies
+	   don't get penalized by the extra checks we do for larger sizes
+	   further down.  */
+	copy_line_and_branch ls, 5f
+	copy_line_and_branch ls, 5f
+	copy_line_and_branch ls, 5f
+	copy_line_and_branch ls, 5f
+	copy_line_and_branch ls, 5f
+	copy_line_and_branch ls, 5f
+	copy_line_and_branch ls, 5f
+	copy_line_and_branch ls, 5f
+
+	/* If less than 2048 bytes remain, jump to the final loop and finish
+	   off the copy.  Otherwise, keep the last 2048 bytes for the final
+	   loop and try a couple of prefetching loops to optimize cache
+	   usage.  */
+	subs	count, count, 2048
+	b.hi	3f
+
+6:
+	/* 2048 bytes or less remaining, adjust COUNT and copy 64 bytes at a
+	   time.  */
+	add	count, count, 2048
+4:
+	copy_line_and_branch hi, 4b
+
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the end even if
+	   there is just 1 byte left.  */
+5:
+	ldp	E_l, E_h, [srcend, -64]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [srcend, -48]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [srcend, -16]
+	stp	D_l, D_h, [dst, 64]
+	stp	E_l, E_h, [dstend, -64]
+	stp	A_l, A_h, [dstend, -48]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
+	ret
+
+	/* Find an offset within the source that operates on a memory bank
+	   other than the one used by the source and destination pointers.  If
+	   we find one within the 1K to 4K range, then we can prefetch at two
+	   offsets to stream more data in from the source.  This computation
+	   formula is provided by Qualcomm.  */
+3:
+	sub	res, src, dst
+	sub	res, res, 1024
+	and	res, res, 0x7ff
+	subs	tmp1, count, res
+	b.hi	7f
+	/* We still have more than 2K bytes remaining, so copy 128 bytes at a
+	   time, prefetching at 2K-128 for every iteration until there is less
+	   than 2K left.  That way we make future data available in L1 and at
+	   the same time, limit our prefetch to within the source data.  */
+2:
+	prfm	PLDL1STRM, [src, 1920]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]
+	ldp	D_l, D_h, [src, 64]
+	stp	A_l, A_h, [dst, 80]
+	ldp	A_l, A_h, [src, 80]
+	stp	B_l, B_h, [dst, 96]
+	ldp	B_l, B_h, [src, 96]
+	stp	C_l, C_h, [dst, 112]
+	ldp	C_l, C_h, [src, 112]
+	stp	D_l, D_h, [dst, 128]!
+	ldp	D_l, D_h, [src, 128]!
+	subs	count, count, 128
+	b.hi	2b
+	b	6b
+
+7:
+	add	res, res, 2048
+
+	/* We found an appropriate offset.  Copy 128 bytes at a time,
+	   prefetching at 2K and the computed offset while the computed offset
+	   is within the source data.  */
+1:
+	prfm	PLDL1STRM, [src, 2048]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]
+	ldp	D_l, D_h, [src, 64]
+	prfm	PLDL1STRM, [src, res]
+	stp	A_l, A_h, [dst, 80]
+	ldp	A_l, A_h, [src, 80]
+	stp	B_l, B_h, [dst, 96]
+	ldp	B_l, B_h, [src, 96]
+	stp	C_l, C_h, [dst, 112]
+	ldp	C_l, C_h, [src, 112]
+	stp	D_l, D_h, [dst, 128]!
+	ldp	D_l, D_h, [src, 128]!
+	subs	tmp1, tmp1, 128
+	b.hi	1b
+
+	/* Update count once the loop is done.  Subtract the 2048 added to RES
+	   for the prefetch offset to account for the 2048 we deducted from
+	   COUNT earlier.  We will have copied:
+
+	   (COUNT - RES) - TMP1
+
+	   bytes so the remaining size is COUNT - (COUNT - RES - TMP1)
+
+	   i.e. RES + TMP1.  If COUNT is non-positive then we have 2048 bytes
+	   or less remaining and we jump forward to the end.  Otherwise we
+	   cascade into the loop below with a single prefetch before cascading
+	   further into the last loop.  */
+	sub	res, res, 2048
+	adds	count, res, tmp1
+	b.ls	6b
+	b	2b
+
+END (__memcpy_falkor)
+libc_hidden_builtin_def (__memcpy_falkor)
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index c92b650..73cb53d 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -41,6 +41,9 @@
 #define IS_THUNDERX(midr) (MIDR_IMPLEMENTOR(midr) == 'C'	\
 			   && MIDR_PARTNUM(midr) == 0x0a1)
 
+#define IS_FALKOR(midr) (MIDR_IMPLEMENTOR(midr) == 'Q'		\
+			 && MIDR_PARTNUM(midr) == 0xc00)
+
 struct cpu_features
 {
   uint64_t midr_el1;