[v2] aarch64: Optimized strlen for strlen_asimd

Message ID 20191022094118.11468-1-zhangxuelei4@huawei.com
State Committed
Headers

Commit Message

Xuelei Zhang Oct. 22, 2019, 9:41 a.m. UTC
  Optimize the strlen implementation by using vector operations and
loop unrolling in the main loop. Compared to __strlen_generic, it
reduces the latency of cases in bench-strlen by 7%~18% when the length
of src is greater than 128 bytes, with gains throughout the benchmark.

Here is the result:

Function: strlen
Variant:
                                    builtin_strlen	generic_strlen	memchr_strlen	__strlen_asimd(old)	__strlen_asimd(new)	__strlen_generic
  

Patch

=====================================================================================================================================================
               length=1, alignment=1:        20.00 (-64.10%)	       14.38 (-17.95%)	       16.25 (-33.33%)	       11.56 (  5.13%)	       11.72 (  3.85%)	       12.19
               length=1, alignment=0:        15.00 (-26.32%)	       12.66 ( -6.58%)	       16.09 (-35.53%)	       12.19 ( -2.63%)	       12.03 ( -1.32%)	       11.88
               length=2, alignment=2:        15.16 (-25.97%)	       14.06 (-16.88%)	       15.62 (-29.87%)	       12.03 (  0.00%)	       11.72 (  2.60%)	       12.03
               length=2, alignment=0:        14.53 (-20.78%)	       12.81 ( -6.49%)	       16.25 (-35.07%)	       12.66 ( -5.19%)	       12.03 (  0.00%)	       12.03
               length=3, alignment=3:        15.00 (-21.52%)	       14.38 (-16.46%)	       15.78 (-27.85%)	       12.03 (  2.53%)	       12.03 (  2.53%)	       12.34
               length=3, alignment=0:        14.53 (-24.00%)	       12.66 ( -8.00%)	       16.88 (-44.00%)	       12.19 ( -4.00%)	       12.03 ( -2.67%)	       11.72
               length=4, alignment=4:        14.69 (-23.68%)	       15.62 (-31.58%)	       16.25 (-36.84%)	       11.88 (  0.00%)	       11.72 (  1.32%)	       11.88
               length=4, alignment=0:        14.84 (-20.25%)	       12.66 ( -2.53%)	       16.72 (-35.44%)	       11.88 (  3.80%)	       12.19 (  1.27%)	       12.34
               length=5, alignment=5:        14.38 (-21.05%)	       14.84 (-25.00%)	       15.62 (-31.58%)	       11.88 (  0.00%)	       11.72 (  1.32%)	       11.88
               length=5, alignment=0:        14.84 (-21.80%)	       13.12 ( -7.69%)	       16.41 (-34.61%)	       12.03 (  1.28%)	       11.88 (  2.56%)	       12.19
               length=6, alignment=6:        14.69 (-25.33%)	       14.69 (-25.33%)	       15.78 (-34.67%)	       11.88 ( -1.33%)	       11.88 ( -1.33%)	       11.72
               length=6, alignment=0:        14.69 (-23.68%)	       13.28 (-11.84%)	       16.41 (-38.16%)	       12.66 ( -6.58%)	       12.34 ( -3.95%)	       11.88
               length=7, alignment=7:        14.84 (-23.38%)	       13.28 (-10.39%)	       15.78 (-31.17%)	       12.19 ( -1.30%)	       12.03 (  0.00%)	       12.03
               length=7, alignment=0:        14.53 (-19.23%)	       12.81 ( -5.13%)	       16.25 (-33.33%)	       12.03 (  1.28%)	       12.03 (  1.28%)	       12.19
               length=4, alignment=0:        14.69 (-25.33%)	       12.81 ( -9.33%)	       15.94 (-36.00%)	       11.72 (  0.00%)	       11.88 ( -1.33%)	       11.72
               length=4, alignment=7:        14.69 (-22.08%)	       13.28 (-10.39%)	       15.94 (-32.47%)	       12.03 (  0.00%)	       12.03 (  0.00%)	       12.03
               length=4, alignment=2:        15.00 (-28.00%)	       15.31 (-30.67%)	       16.09 (-37.33%)	       11.88 ( -1.33%)	       12.03 ( -2.67%)	       11.72
               length=2, alignment=2:        14.69 (-23.68%)	       14.06 (-18.42%)	       15.78 (-32.89%)	       12.03 ( -1.32%)	       12.03 ( -1.32%)	       11.88
               length=8, alignment=0:        14.84 (-26.67%)	       14.53 (-24.00%)	       16.09 (-37.33%)	       12.03 ( -2.67%)	       11.72 (  0.00%)	       11.72
               length=8, alignment=7:        14.22 (-19.74%)	       12.97 ( -9.21%)	       15.94 (-34.21%)	       12.03 ( -1.32%)	       11.72 (  1.32%)	       11.88
               length=8, alignment=3:        14.84 (-25.00%)	       17.19 (-44.74%)	       15.78 (-32.89%)	       11.88 (  0.00%)	       11.72 (  1.32%)	       11.88
               length=5, alignment=3:        15.00 (-24.68%)	       15.16 (-25.97%)	       15.94 (-32.47%)	       11.88 (  1.30%)	       12.03 (  0.00%)	       12.03
              length=16, alignment=0:        16.41 (-17.98%)	       15.47 (-11.24%)	       16.09 (-15.73%)	       12.19 ( 12.36%)	       13.59 (  2.25%)	       13.91
              length=16, alignment=7:        16.25 (-14.29%)	       15.62 ( -9.89%)	       16.09 (-13.19%)	       12.34 ( 13.19%)	       13.44 (  5.49%)	       14.22
              length=16, alignment=4:        16.09 (-17.05%)	       17.19 (-25.00%)	       15.62 (-13.64%)	       12.03 ( 12.50%)	       13.59 (  1.14%)	       13.75
              length=10, alignment=4:        15.31 (-27.27%)	       16.41 (-36.36%)	       15.78 (-31.17%)	       11.88 (  1.30%)	       12.50 ( -3.90%)	       12.03
              length=32, alignment=0:        15.94 ( -5.15%)	       18.28 (-20.62%)	       18.59 (-22.68%)	       14.22 (  6.18%)	       13.44 ( 11.34%)	       15.16
              length=32, alignment=7:        15.16 ( -4.30%)	       18.44 (-26.88%)	       17.19 (-18.28%)	       12.81 ( 11.83%)	       13.12 (  9.68%)	       14.53
              length=32, alignment=5:        15.31 ( -7.69%)	       20.94 (-47.25%)	       16.41 (-15.38%)	       12.34 ( 13.19%)	       12.81 (  9.89%)	       14.22
              length=21, alignment=5:        16.09 (-17.05%)	       18.28 (-32.95%)	       15.94 (-15.91%)	       12.03 ( 12.50%)	       13.12 (  4.55%)	       13.75
              length=64, alignment=0:        18.59 ( -4.39%)	       23.12 (-29.82%)	       19.22 ( -7.90%)	       15.62 ( 12.28%)	       15.94 ( 10.53%)	       17.81
              length=64, alignment=7:        18.12 (-10.48%)	       23.91 (-45.71%)	       19.69 (-20.00%)	       14.69 ( 10.48%)	       14.53 ( 11.43%)	       16.41
              length=64, alignment=6:        17.19 ( -1.85%)	       23.12 (-37.04%)	       24.06 (-42.59%)	       14.69 ( 12.96%)	       14.53 ( 13.89%)	       16.88
              length=42, alignment=6:        18.91 (-16.35%)	       20.16 (-24.04%)	       17.19 ( -5.77%)	       14.06 ( 13.46%)	       15.94 (  1.92%)	       16.25
             length=128, alignment=0:        21.09 (  4.25%)	       32.81 (-48.94%)	       21.72 (  1.42%)	       19.22 ( 12.77%)	       19.22 ( 12.77%)	       22.03
             length=128, alignment=7:        19.38 ( 10.14%)	       32.66 (-51.45%)	       21.72 ( -0.72%)	       19.22 ( 10.87%)	       18.44 ( 14.49%)	       21.56
             length=128, alignment=7:        18.75 ( 12.41%)	       31.09 (-45.26%)	       19.69 (  8.03%)	       19.22 ( 10.22%)	       18.44 ( 13.87%)	       21.41
              length=85, alignment=7:        21.72 (-17.80%)	       26.56 (-44.07%)	       24.22 (-31.36%)	       17.03 (  7.63%)	       16.56 ( 10.17%)	       18.44
             length=256, alignment=0:        30.16 (  3.50%)	       64.22 (-105.50%)	       25.94 ( 17.00%)	       26.88 ( 14.00%)	       26.56 ( 15.00%)	       31.25
             length=256, alignment=7:        28.75 (  7.07%)	       51.25 (-65.66%)	       28.75 (  7.07%)	       27.19 ( 12.12%)	       27.66 ( 10.61%)	       30.94
             length=256, alignment=8:        29.06 (  5.58%)	       65.47 (-112.69%)	       25.62 ( 16.75%)	       27.03 ( 12.18%)	       27.81 (  9.64%)	       30.78
             length=170, alignment=8:        24.53 (  4.85%)	       38.28 (-48.48%)	       22.66 ( 12.12%)	       23.59 (  8.48%)	       22.19 ( 13.94%)	       25.78
             length=512, alignment=0:        45.47 (  9.91%)	       94.22 (-86.69%)	       37.50 ( 25.70%)	       43.75 ( 13.31%)	       43.44 ( 13.93%)	       50.47
             length=512, alignment=7:        44.84 ( 10.03%)	       94.22 (-89.03%)	       38.28 ( 23.20%)	       43.91 ( 11.91%)	       44.06 ( 11.60%)	       49.84
             length=512, alignment=9:        44.53 ( 11.49%)	       97.03 (-92.86%)	       37.97 ( 24.53%)	       43.44 ( 13.66%)	       43.91 ( 12.73%)	       50.31
             length=341, alignment=9:        35.94 (  8.37%)	       71.72 (-82.87%)	       30.62 ( 21.91%)	       32.19 ( 17.93%)	       34.38 ( 12.35%)	       39.22
            length=1024, alignment=0:        78.75 ( 11.27%)	      168.28 (-89.61%)	       61.09 ( 31.16%)	      103.12 (-16.20%)	       76.41 ( 13.91%)	       88.75
            length=1024, alignment=7:        76.88 ( 11.83%)	      168.28 (-93.01%)	       62.03 ( 28.85%)	      105.94 (-21.51%)	       77.50 ( 11.11%)	       87.19
           length=1024, alignment=10:        77.81 ( 11.23%)	      170.78 (-94.83%)	       61.88 ( 29.41%)	      102.66 (-17.11%)	       77.66 ( 11.41%)	       87.66
            length=682, alignment=10:        60.31 (  9.18%)	      125.94 (-89.65%)	       45.31 ( 31.76%)	       55.16 ( 16.94%)	       58.44 ( 12.00%)	       66.41
            length=2048, alignment=0:       145.94 ( 13.84%)	      316.09 (-86.62%)	      110.78 ( 34.59%)	      143.59 ( 15.22%)	      144.69 ( 14.58%)	      169.38
            length=2048, alignment=7:       145.31 ( 16.44%)	      316.09 (-81.76%)	      111.09 ( 36.12%)	      144.53 ( 16.89%)	      143.28 ( 17.61%)	      173.91
           length=2048, alignment=11:       144.84 ( 16.86%)	      319.38 (-83.32%)	      111.25 ( 36.14%)	      144.38 ( 17.13%)	      143.59 ( 17.58%)	      174.22
           length=1365, alignment=11:       101.41 ( 17.01%)	      221.41 (-81.20%)	       78.59 ( 35.68%)	      100.94 ( 17.39%)	      100.78 ( 17.52%)	      122.19
            length=4096, alignment=0:       280.00 ( 10.62%)	      617.19 (-97.01%)	      221.88 ( 29.18%)	      301.41 (  3.79%)	      278.44 ( 11.12%)	      313.28
            length=4096, alignment=7:       283.75 ( 12.61%)	      618.44 (-90.47%)	      208.12 ( 35.90%)	      292.34 (  9.96%)	      277.81 ( 14.44%)	      324.69
           length=4096, alignment=12:       283.59 ( 12.87%)	      621.25 (-90.88%)	      208.12 ( 36.05%)	      293.75 (  9.75%)	      277.34 ( 14.79%)	      325.47
           length=2730, alignment=12:       202.66 (  8.85%)	      424.06 (-90.72%)	      142.34 ( 35.98%)	      203.91 (  8.29%)	      201.88 (  9.21%)	      222.34
---
 sysdeps/aarch64/multiarch/strlen.c       |  4 ++-
 sysdeps/aarch64/multiarch/strlen_asimd.S | 42 ++++++++++++++++++++------------
 2 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c
index 1db01babeec..abf6513eeea 100644
--- a/sysdeps/aarch64/multiarch/strlen.c
+++ b/sysdeps/aarch64/multiarch/strlen.c
@@ -34,7 +34,9 @@  extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
 
 libc_ifunc (__strlen,
-	    (USE_ASIMD_STRLEN () ? __strlen_asimd : __strlen_generic));
+	    (USE_ASIMD_STRLEN () || IS_KUNPENG(midr)
+	    ? __strlen_asimd
+	    :__strlen_generic));
 
 # undef strlen
 strong_alias (__strlen, strlen);
diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
index 1d1c6abb825..1de6cd3a173 100644
--- a/sysdeps/aarch64/multiarch/strlen_asimd.S
+++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
@@ -48,6 +48,9 @@ 
 #define dataq2		q3
 #define datav2		v3
 
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
 #ifdef TEST_PAGE_CROSS
 # define MIN_PAGE_SIZE 16
 #else
@@ -82,40 +85,47 @@  ENTRY_ALIGN (__strlen_asimd, 6)
 	DELOUSE (0)
 	DELOUSE (1)
 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
+	mov	zeroones, REP8_01
 	cmp	tmp1, MIN_PAGE_SIZE - 16
 	b.gt	L(page_cross)
-	ldr	dataq, [srcin]
+	ldp	data1, data2, [srcin]
 #ifdef __AARCH64EB__
-	rev64	datav.16b, datav.16b
+	rev	data1, data1
+	rev	data2, data2
 #endif
 
-	/* Get the minimum value and keep going if it is not zero.  */
-	uminv	datab2, datav.16b
-	mov	tmp1, datav2.d[0]
-	cbnz	tmp1, L(main_loop_entry)
-
-	cmeq	datav.16b, datav.16b, #0
-	mov	data1, datav.d[0]
-	mov	data2, datav.d[1]
-	cmp	data1, 0
-	csel	data1, data1, data2, ne
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(main_loop_entry)
+	csel	has_nul1, has_nul1, has_nul2, cc
 	mov	len, 8
-	rev	data1, data1
-	clz	tmp1, data1
-	csel	len, xzr, len, ne
+	rev	has_nul1, has_nul1
+	clz	tmp1, has_nul1
+	csel	len, xzr, len, cc
 	add	len, len, tmp1, lsr 3
 	ret
 
 L(main_loop_entry):
 	bic	src, srcin, 15
+	sub	src, src, 16
 
 L(main_loop):
-	ldr	dataq, [src, 16]!
+	ldr	dataq, [src, 32]!
 L(page_cross_entry):
 	/* Get the minimum value and keep going if it is not zero.  */
 	uminv	datab2, datav.16b
 	mov	tmp1, datav2.d[0]
+	cbz	tmp1, L(tail)
+	ldr	dataq, [src, 16]
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
 	cbnz	tmp1, L(main_loop)
+	add	src, src, 16
 
 L(tail):
 #ifdef __AARCH64EB__