[v2] aarch64: Optimized strlen for strlen_asimd

Message ID VI1PR0801MB212778B7354860C87DA03F3C83680@VI1PR0801MB2127.eurprd08.prod.outlook.com
State Committed
Commit c2150769d015dca1805334af7743829f1e4c0b6a

Commit Message

Wilco Dijkstra Oct. 22, 2019, 4:33 p.m. UTC
  Hi Xuelei,

> Optimize the strlen implementation by using vector operations and
> loop unrolling in the main loop. Compared to __strlen_generic, it
> reduces the latency of cases in bench-strlen by 7%~18% when the length
> of src is greater than 128 bytes, with gains throughout the benchmark.

This is a good improvement, OK to commit. Also, given that it uses integer
arithmetic for the first 16 bytes, it can never be worse than the generic
variant for small inputs.

Wilco
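
The "integer arithmetic" mentioned above is the classic zero-byte bit
trick that the patch encodes with the REP8_01/REP8_7f constants. A
minimal C sketch of the check, assuming a 64-bit word (the function
name is illustrative, not glibc's):

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Nonzero iff X contains a zero byte.  Mirrors the assembly sequence
   in the patch: tmp1 = x - REP8_01; tmp2 = x | REP8_7f;
   has_nul = tmp1 & ~tmp2 (the BICS instruction).  */
static inline uint64_t
has_zero_byte (uint64_t x)
{
  return (x - REP8_01) & ~(x | REP8_7f);
}

On little-endian input the lowest set 0x80 bit in the result marks the
first NUL, which the assembly locates with a rev/clz pair; big-endian
input is byte-reversed first.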



Comments

Adhemerval Zanella Dec. 19, 2019, 7:44 p.m. UTC | #1
On 22/10/2019 13:33, Wilco Dijkstra wrote:
> Hi Xuelei,
> 
>> Optimize the strlen implementation by using vector operations and
>> loop unrolling in the main loop. Compared to __strlen_generic, it
>> reduces the latency of cases in bench-strlen by 7%~18% when the length
>> of src is greater than 128 bytes, with gains throughout the benchmark.
> 
> This is a good improvement, OK to commit. Also, given that it uses integer
> arithmetic for the first 16 bytes, it can never be worse than the generic
> variant for small inputs.
> 
> Wilco

I pushed it upstream.
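
As context for the assembly in the patch below: the new main loop walks
32 bytes per iteration and tests each 16-byte chunk with a byte-wise
minimum (the UMINV instruction), since a minimum of zero means the chunk
contains a NUL. A rough C/NEON sketch of the same structure (a model
only; the helper name is mine, not glibc code):

#include <arm_neon.h>
#include <stdint.h>

/* SRC must be 16-byte aligned; returns the 16-byte chunk holding the
   first NUL.  Models the ldr/uminv/cbz unrolling in strlen_asimd.S.  */
static const uint8_t *
find_nul_chunk (const uint8_t *src)
{
  for (;;)
    {
      uint8x16_t v = vld1q_u8 (src);
      if (vminvq_u8 (v) == 0)	/* NUL in the first 16 bytes.  */
	return src;
      v = vld1q_u8 (src + 16);
      if (vminvq_u8 (v) == 0)	/* NUL in the second 16 bytes.  */
	return src + 16;
      src += 32;		/* Two chunks consumed per iteration.  */
    }
}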
  

Patch

diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c
index 1db01babeec..abf6513eeea 100644
--- a/sysdeps/aarch64/multiarch/strlen.c
+++ b/sysdeps/aarch64/multiarch/strlen.c
@@ -34,7 +34,9 @@  extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
 
 libc_ifunc (__strlen,
-	    (USE_ASIMD_STRLEN () ? __strlen_asimd : __strlen_generic));
+	    (USE_ASIMD_STRLEN () || IS_KUNPENG(midr)
+	    ? __strlen_asimd
+	    :__strlen_generic));
 
 # undef strlen
 strong_alias (__strlen, strlen);
diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
index 1d1c6abb825..1de6cd3a173 100644
--- a/sysdeps/aarch64/multiarch/strlen_asimd.S
+++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
@@ -48,6 +48,9 @@ 
 #define dataq2		q3
 #define datav2		v3
 
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
 #ifdef TEST_PAGE_CROSS
 # define MIN_PAGE_SIZE 16
 #else
@@ -82,40 +85,47 @@  ENTRY_ALIGN (__strlen_asimd, 6)
 	DELOUSE (0)
 	DELOUSE (1)
 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
+	mov	zeroones, REP8_01
 	cmp	tmp1, MIN_PAGE_SIZE - 16
 	b.gt	L(page_cross)
-	ldr	dataq, [srcin]
+	ldp	data1, data2, [srcin]
 #ifdef __AARCH64EB__
-	rev64	datav.16b, datav.16b
+	rev	data1, data1
+	rev	data2, data2
 #endif
 
-	/* Get the minimum value and keep going if it is not zero.  */
-	uminv	datab2, datav.16b
-	mov	tmp1, datav2.d[0]
-	cbnz	tmp1, L(main_loop_entry)
-
-	cmeq	datav.16b, datav.16b, #0
-	mov	data1, datav.d[0]
-	mov	data2, datav.d[1]
-	cmp	data1, 0
-	csel	data1, data1, data2, ne
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(main_loop_entry)
+	csel	has_nul1, has_nul1, has_nul2, cc
 	mov	len, 8
-	rev	data1, data1
-	clz	tmp1, data1
-	csel	len, xzr, len, ne
+	rev	has_nul1, has_nul1
+	clz	tmp1, has_nul1
+	csel	len, xzr, len, cc
 	add	len, len, tmp1, lsr 3
 	ret
 
 L(main_loop_entry):
 	bic	src, srcin, 15
+	sub	src, src, 16
 
 L(main_loop):
-	ldr	dataq, [src, 16]!
+	ldr	dataq, [src, 32]!
 L(page_cross_entry):
 	/* Get the minimum value and keep going if it is not zero.  */
 	uminv	datab2, datav.16b
 	mov	tmp1, datav2.d[0]
+	cbz	tmp1, L(tail)
+	ldr	dataq, [src, 16]
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
 	cbnz	tmp1, L(main_loop)
+	add	src, src, 16
 
 L(tail):
 #ifdef __AARCH64EB__