From patchwork Wed Dec 17 12:12:25 2014
X-Patchwork-Submitter: Richard Earnshaw
X-Patchwork-Id: 4298
Message-ID: <54917329.4090601@arm.com>
Date: Wed, 17 Dec 2014 12:12:25 +0000
From: Richard Earnshaw
To: Glibc Development List <libc-alpha@sourceware.org>
Subject: [Patch, AArch64] Optimized strcpy

This patch contains an optimized implementation of strcpy for AArch64
systems.  Benchmarking shows that it is approximately 20-25% faster than
the generic implementation across the board.

R.

Richard Earnshaw

	* sysdeps/aarch64/strcpy.S: New file.

diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
new file mode 100644
index 0000000..1cdf2a1
--- /dev/null
+++ b/sysdeps/aarch64/strcpy.S
@@ -0,0 +1,202 @@
+/* Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses
+ */
+
+/* Arguments and results.  */
+#define dstin		x0
+#define src		x1
+
+/* Locals and temporaries.  */
+#define dst		x2
+#define data1		x3
+#define data1w		w3
+#define data2		x4
+#define has_nul1	x5
+#define has_nul2	x6
+#define tmp1		x7
+#define tmp2		x8
+#define tmp3		x9
+#define tmp4		x10
+#define zeroones	x11
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+	/* Start of critical section -- keep to one 64-byte cache line.  */
+ENTRY_ALIGN (strcpy,6)
+	mov	zeroones, #REP8_01
+	mov	dst, dstin
+	ands	tmp1, src, #15
+	b.ne	L(misaligned)
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word.  */
+	/* The inner loop deals with two Dwords at a time.  This has a
+	   slightly higher start-up cost, but we should win quite quickly,
+	   especially on cores with a high number of issue slots per
+	   cycle, as we get much better parallelism out of the operations.  */
+	b	L(first_pass)
+L(main_loop):
+	stp	data1, data2, [dst], #16
+L(startloop_fast):
+	ldp	data1, data2, [src], #16
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+	bic	has_nul1, tmp1, tmp2
+	bics	has_nul2, tmp3, tmp4
+	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
+	b.eq	L(main_loop)
+	/* End of critical section -- keep to one 64-byte cache line.  */
+
+	cbnz	has_nul1, L(nul_in_data1_fast)
+L(nul_in_data2_fast):
+	str	data1, [dst], #8
+L(nul_in_data2_fast_after_d1):
+	/* For a NUL in data2, we always know that we've moved at least 8
+	   bytes, so no need for a slow path.  */
+#ifdef __AARCH64EB__
+	/* For big-endian only, carry propagation means we can't trust
+	   the MSB of the syndrome value calculated above (the byte
+	   sequence 01 00 will generate a syndrome of 80 80 rather than
+	   00 80).  We get around this by byte-swapping the data and
+	   re-calculating.  */
+	rev	data2, data2
+	sub	tmp1, data2, zeroones
+	orr	tmp2, data2, #REP8_7f
+	bic	has_nul2, tmp1, tmp2
+#endif
+	rev	has_nul2, has_nul2
+	sub	src, src, #(8+7)
+	clz	has_nul2, has_nul2
+	lsr	has_nul2, has_nul2, #3	/* Bits to bytes.  */
+	sub	dst, dst, #7
+	ldr	data2, [src, has_nul2]
+	str	data2, [dst, has_nul2]
+	ret
+
+L(nul_in_data1_fast):
+	/* Since we know we've already copied at least 8 bytes, we can
+	   safely handle the tail with one misaligned dword move.  To do
+	   this we calculate the location of the trailing NUL byte and go
+	   seven bytes back from that.  */
+#ifdef __AARCH64EB__
+	/* For big-endian only, carry propagation means we can't trust
+	   the MSB of the syndrome value calculated above (the byte
+	   sequence 01 00 will generate a syndrome of 80 80 rather than
+	   00 80).  We get around this by byte-swapping the data and
+	   re-calculating.  */
+	rev	data1, data1
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	bic	has_nul1, tmp1, tmp2
+#endif
+	rev	has_nul1, has_nul1
+	sub	src, src, #(16+7)
+	clz	has_nul1, has_nul1
+	lsr	has_nul1, has_nul1, #3	/* Bits to bytes.  */
+	sub	dst, dst, #7
+	ldr	data1, [src, has_nul1]
+	str	data1, [dst, has_nul1]
+	ret
+
+L(first_pass):
+	ldp	data1, data2, [src], #16
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+	bic	has_nul1, tmp1, tmp2
+	bics	has_nul2, tmp3, tmp4
+	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
+	b.eq	L(main_loop)
+
+	cbz	has_nul1, L(nul_in_data2_fast)
+L(nul_in_data1):
+	/* Slow path.  We can't be sure we've moved at least 8 bytes, so
+	   fall back to a slow byte-by-byte store of the bits already
+	   loaded.
+
+	   The worst case when coming through this path is that we've had
+	   to copy seven individual bytes to get to alignment and we then
+	   have to copy another seven (eight for big-endian) again here.
+	   We could try to detect that case (and any case where more than
+	   eight bytes have to be copied), but it really doesn't seem
+	   worth it.  */
+#ifdef __AARCH64EB__
+	rev	data1, data1
+#else
+	/* On little-endian, we can easily check if the NUL byte was
+	   in the last byte of the Dword.  For big-endian we'd have to
+	   recalculate the syndrome, which is unlikely to be worth it.  */
+	lsl	has_nul1, has_nul1, #8
+	cbnz	has_nul1, 1f
+	str	data1, [dst]
+	ret
+#endif
+1:
+	strb	data1w, [dst], #1
+	tst	data1, #0xff
+	lsr	data1, data1, #8
+	b.ne	1b
+L(done):
+	ret
+
+L(misaligned):
+	cmp	tmp1, #8
+	b.ge	2f
+	/* There's at least one Dword before we reach alignment, so we can
+	   deal with that efficiently.  */
+	ldr	data1, [src]
+	bic	src, src, #15
+	sub	tmp3, data1, zeroones
+	orr	tmp4, data1, #REP8_7f
+	bics	has_nul1, tmp3, tmp4
+	b.ne	L(nul_in_data1)
+	str	data1, [dst], #8
+	ldr	data2, [src, #8]
+	add	src, src, #16
+	sub	dst, dst, tmp1
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+	bics	has_nul2, tmp3, tmp4
+	b.ne	L(nul_in_data2_fast_after_d1)
+	str	data2, [dst], #8
+	/* We can by-pass the first-pass version of the loop in this case
+	   since we know that at least 8 bytes have already been copied.  */
+	b	L(startloop_fast)
+
+2:
+	sub	tmp1, tmp1, #16
+3:
+	ldrb	data1w, [src], #1
+	strb	data1w, [dst], #1
+	cbz	data1w, L(done)
+	add	tmp1, tmp1, #1
+	cbnz	tmp1, 3b
+	b	L(first_pass)
+END (strcpy)
+libc_hidden_builtin_def (strcpy)
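
As a footnote for readers less familiar with the idiom: the zero-byte test
used throughout the patch corresponds roughly to the following C sketch.
This is illustrative only, not part of the patch, and the helper names are
made up here:

#include <stdint.h>

/* A byte of X is zero iff the corresponding byte of
   (X - 0x01..01) & ~(X | 0x7f..7f) has its top bit set; this is what the
   sub/orr/bic(s) sequences above compute for each 64-bit word.  */
static inline uint64_t
has_nul_syndrome (uint64_t x)
{
  const uint64_t rep8_01 = 0x0101010101010101ULL;
  const uint64_t rep8_7f = 0x7f7f7f7f7f7f7f7fULL;
  return (x - rep8_01) & ~(x | rep8_7f);
}

/* Little-endian only: byte index of the first NUL, mirroring the
   rev/clz/lsr #3 sequence in the fast paths.  Requires a non-zero
   syndrome.  (On big-endian the assembly byte-swaps the data and
   recomputes the syndrome first, because borrow propagation can set a
   spurious high bit -- the 01 00 => 80 80 case described in the
   comments above.)  */
static inline unsigned int
first_nul_index (uint64_t syndrome)
{
  return __builtin_clzll (__builtin_bswap64 (syndrome)) >> 3;
}

In these terms, the main loop keeps iterating while
(has_nul_syndrome (data1) | has_nul_syndrome (data2)) == 0; the bics/ccmp
pair evaluates that combined test without a separate OR or an extra branch.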