[v2] aarch64: Optimized implementation of strnlen

Message ID 20191022094003.9612-1-zhangxuelei4@huawei.com
State Committed

Commit Message

Xuelei Zhang Oct. 22, 2019, 9:40 a.m. UTC
  Optimize the strnlen implementation by using vector operations and
loop unrolling in the main loop. Compared to aarch64/strnlen.S, it
reduces the latency of cases in bench-strnlen by 11%~24% when the length
of src is greater than 64 bytes, with gains throughout the benchmark.
---
 sysdeps/aarch64/strnlen.S | 52 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)
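
For readers skimming the patch below, the core trick is that the unsigned
byte minimum of a 16-byte chunk (UMINV) is zero exactly when the chunk
contains a NUL byte. Here is a rough C sketch of the main loop using ACLE
NEON intrinsics; the function name and the simplified head/tail handling
are illustrative only and not part of the patch, and the committed
assembly additionally unrolls the loop 2x and folds the limit check into
the flags with CCMP:

#include <arm_neon.h>
#include <stddef.h>

/* Sketch only: structure and names are illustrative, not the patch.  */
static size_t
strnlen_sketch (const char *s, size_t maxlen)
{
  const char *p = s;
  /* Alignment and sub-16-byte head handling omitted for brevity.  */
  while ((size_t) (p - s) + 16 <= maxlen)
    {
      uint8x16_t chunk = vld1q_u8 ((const uint8_t *) p);
      if (vminvq_u8 (chunk) == 0)	/* Min byte is 0 iff a NUL is present.  */
	break;
      p += 16;
    }
  /* Finish bytewise: the leftover tail or the chunk holding the NUL.  */
  while ((size_t) (p - s) < maxlen && *p != '\0')
    p++;
  return (size_t) (p - s);
}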
  

Comments

Wilco Dijkstra Oct. 22, 2019, 5:57 p.m. UTC | #1
Hi Xuelei,

> Optimize the strnlen implementation by using vector operations and
> loop unrolling in the main loop. Compared to aarch64/strnlen.S, it
> reduces the latency of cases in bench-strnlen by 11%~24% when the length
> of src is greater than 64 bytes, with gains throughout the benchmark.

Like with strcpy, this improves performance on various microarchitectures,
so this is OK.

Wilco
  
Adhemerval Zanella Dec. 19, 2019, 7:44 p.m. UTC | #2
On 22/10/2019 14:57, Wilco Dijkstra wrote:
> Hi Xuelei,
> 
>> Optimize the strnlen implementation by using vector operations and
>> loop unrolling in the main loop. Compared to aarch64/strnlen.S, it
>> reduces the latency of cases in bench-strnlen by 11%~24% when the length
>> of src is greater than 64 bytes, with gains throughout the benchmark.
> 
> Like with strcpy, this improves performance on various microarchitectures,
> so this is OK.
> 
> Wilco

I pushed it upstream.
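
As background for the tail computation in the patch below: after CMEQ
against zero, NUL bytes become 0xff and all other bytes 0x00, so
byte-reversing the selected 64-bit half (REV) and counting leading zeros
(CLZ) yields eight times the index of the earliest NUL. A small C model
of that step, assuming little-endian input as in the patch (big-endian
is pre-swapped with REV64 there); the function name and the use of GCC
builtins are illustrative, not glibc code:

#include <stdint.h>

/* Model of the CMEQ/REV/CLZ step.  Only meaningful when MASK is
   nonzero, i.e. a NUL is known to be present, as in the assembly.  */
static unsigned
first_nul_index (uint64_t mask)	/* 0xff in each NUL byte position.  */
{
  uint64_t rev = __builtin_bswap64 (mask);	/* Like the REV insn.  */
  return (unsigned) (__builtin_clzll (rev) >> 3);	/* CLZ, then /8.  */
}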
  

Patch

diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index 70283c80749..a57753b0a28 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -45,6 +45,11 @@ 
 #define pos		x13
 #define limit_wd	x14
 
+#define dataq		q2
+#define datav		v2
+#define datab2		b3
+#define dataq2		q3
+#define datav2		v3
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
 #define REP8_80 0x8080808080808080
@@ -71,7 +76,7 @@  ENTRY_ALIGN_AND_PAD (__strnlen, 6, 9)
 	   cycle, as we get much better parallelism out of the operations.  */
 
 	/* Start of critical section -- keep to one 64Byte cache line.  */
-L(loop):
+
 	ldp	data1, data2, [src], #16
 L(realigned):
 	sub	tmp1, data1, zeroones
@@ -119,6 +124,51 @@  L(nul_in_data2):
 	csel	len, len, limit, ls		/* Return the lower value.  */
 	RET
 
+L(loop):
+	ldr	dataq, [src], #16
+	uminv	datab2, datav.16b	/* Min byte is 0 iff a NUL is present.  */
+	mov	tmp1, datav2.d[0]
+	subs	limit_wd, limit_wd, #1
+	ccmp	tmp1, #0, #4, pl	/* NZCV = 0100 once limit is reached.  */
+	b.eq	L(loop_end)		/* NUL found or limit exhausted.  */
+	ldr	dataq, [src], #16	/* Unrolled second iteration.  */
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
+	subs	limit_wd, limit_wd, #1
+	ccmp	tmp1, #0, #4, pl	/* NZCV = 0100 once limit is reached.  */
+	b.ne	L(loop)
+L(loop_end):
+	/* End of critical section -- keep to one 64Byte cache line.  */
+
+	cbnz	tmp1, L(hit_limit)	/* No null in final Qword.  */
+
+	/* We know there's a null in the final Qword.  The easiest thing
+	   to do now is work out the length of the string and return
+	   MIN (len, limit).  */
+
+#ifdef __AARCH64EB__
+	rev64	datav.16b, datav.16b
+#endif
+	/* Set the NULL byte to 0xff and the rest to 0x00, move the data into
+	   a pair of scalars and then compute the length from the earliest
+	   NULL byte.  */
+
+	cmeq	datav.16b, datav.16b, #0
+	mov	data1, datav.d[0]
+	mov	data2, datav.d[1]
+	cmp	data1, 0		/* NE iff the NULL is in the low half.  */
+	csel	data1, data1, data2, ne
+	sub	len, src, srcin
+	sub	len, len, #16		/* Undo the final post-increment.  */
+	rev	data1, data1		/* First memory byte becomes the MSB.  */
+	add	tmp2, len, 8
+	clz	tmp1, data1		/* 8 * index of the earliest NULL byte.  */
+	csel	len, len, tmp2, ne	/* Add 8 if it is in the high half.  */
+	add	len, len, tmp1, lsr 3
+	cmp	len, limit
+	csel	len, len, limit, ls		/* Return the lower value.  */
+	RET
+
 L(misaligned):
 	/* Deal with a partial first word.
 	   We're doing two things in parallel here;