aarch64: Optimized strlen for Kunpeng processor

Message ID 20191017145223.20728-1-zhangxuelei4@huawei.com
State Superseded
Headers

Commit Message

Xuelei Zhang Oct. 17, 2019, 2:52 p.m. UTC
  Optimize the strlen implementation by using vector operations and
loop unrolling in the main loop. Compared to __strlen_generic, it reduces
the latency of cases in bench-strlen by 7%~18% when the length of src
is greater than 128 bytes, with gains throughout the benchmark.

Here is the result:

Function: strlen
Variant:
                                    builtin_strlen	generic_strlen	memchr_strlen	__strlen_asimd	__strlen_kunpeng	__strlen_generic
  

Patch

========================================================================================================================
               length=1, alignment=1:        20.00 (-64.10%)	       14.38 (-17.95%)	       16.25 (-33.33%)	       11.56 (  5.13%)	       11.72 (  3.85%)	       12.19
               length=1, alignment=0:        15.00 (-26.32%)	       12.66 ( -6.58%)	       16.09 (-35.53%)	       12.19 ( -2.63%)	       12.03 ( -1.32%)	       11.88
               length=2, alignment=2:        15.16 (-25.97%)	       14.06 (-16.88%)	       15.62 (-29.87%)	       12.03 (  0.00%)	       11.72 (  2.60%)	       12.03
               length=2, alignment=0:        14.53 (-20.78%)	       12.81 ( -6.49%)	       16.25 (-35.07%)	       12.66 ( -5.19%)	       12.03 (  0.00%)	       12.03
               length=3, alignment=3:        15.00 (-21.52%)	       14.38 (-16.46%)	       15.78 (-27.85%)	       12.03 (  2.53%)	       12.03 (  2.53%)	       12.34
               length=3, alignment=0:        14.53 (-24.00%)	       12.66 ( -8.00%)	       16.88 (-44.00%)	       12.19 ( -4.00%)	       12.03 ( -2.67%)	       11.72
               length=4, alignment=4:        14.69 (-23.68%)	       15.62 (-31.58%)	       16.25 (-36.84%)	       11.88 (  0.00%)	       11.72 (  1.32%)	       11.88
               length=4, alignment=0:        14.84 (-20.25%)	       12.66 ( -2.53%)	       16.72 (-35.44%)	       11.88 (  3.80%)	       12.19 (  1.27%)	       12.34
               length=5, alignment=5:        14.38 (-21.05%)	       14.84 (-25.00%)	       15.62 (-31.58%)	       11.88 (  0.00%)	       11.72 (  1.32%)	       11.88
               length=5, alignment=0:        14.84 (-21.80%)	       13.12 ( -7.69%)	       16.41 (-34.61%)	       12.03 (  1.28%)	       11.88 (  2.56%)	       12.19
               length=6, alignment=6:        14.69 (-25.33%)	       14.69 (-25.33%)	       15.78 (-34.67%)	       11.88 ( -1.33%)	       11.88 ( -1.33%)	       11.72
               length=6, alignment=0:        14.69 (-23.68%)	       13.28 (-11.84%)	       16.41 (-38.16%)	       12.66 ( -6.58%)	       12.34 ( -3.95%)	       11.88
               length=7, alignment=7:        14.84 (-23.38%)	       13.28 (-10.39%)	       15.78 (-31.17%)	       12.19 ( -1.30%)	       12.03 (  0.00%)	       12.03
               length=7, alignment=0:        14.53 (-19.23%)	       12.81 ( -5.13%)	       16.25 (-33.33%)	       12.03 (  1.28%)	       12.03 (  1.28%)	       12.19
               length=4, alignment=0:        14.69 (-25.33%)	       12.81 ( -9.33%)	       15.94 (-36.00%)	       11.72 (  0.00%)	       11.88 ( -1.33%)	       11.72
               length=4, alignment=7:        14.69 (-22.08%)	       13.28 (-10.39%)	       15.94 (-32.47%)	       12.03 (  0.00%)	       12.03 (  0.00%)	       12.03
               length=4, alignment=2:        15.00 (-28.00%)	       15.31 (-30.67%)	       16.09 (-37.33%)	       11.88 ( -1.33%)	       12.03 ( -2.67%)	       11.72
               length=2, alignment=2:        14.69 (-23.68%)	       14.06 (-18.42%)	       15.78 (-32.89%)	       12.03 ( -1.32%)	       12.03 ( -1.32%)	       11.88
               length=8, alignment=0:        14.84 (-26.67%)	       14.53 (-24.00%)	       16.09 (-37.33%)	       12.03 ( -2.67%)	       11.72 (  0.00%)	       11.72
               length=8, alignment=7:        14.22 (-19.74%)	       12.97 ( -9.21%)	       15.94 (-34.21%)	       12.03 ( -1.32%)	       11.72 (  1.32%)	       11.88
               length=8, alignment=3:        14.84 (-25.00%)	       17.19 (-44.74%)	       15.78 (-32.89%)	       11.88 (  0.00%)	       11.72 (  1.32%)	       11.88
               length=5, alignment=3:        15.00 (-24.68%)	       15.16 (-25.97%)	       15.94 (-32.47%)	       11.88 (  1.30%)	       12.03 (  0.00%)	       12.03
              length=16, alignment=0:        16.41 (-17.98%)	       15.47 (-11.24%)	       16.09 (-15.73%)	       12.19 ( 12.36%)	       13.59 (  2.25%)	       13.91
              length=16, alignment=7:        16.25 (-14.29%)	       15.62 ( -9.89%)	       16.09 (-13.19%)	       12.34 ( 13.19%)	       13.44 (  5.49%)	       14.22
              length=16, alignment=4:        16.09 (-17.05%)	       17.19 (-25.00%)	       15.62 (-13.64%)	       12.03 ( 12.50%)	       13.59 (  1.14%)	       13.75
              length=10, alignment=4:        15.31 (-27.27%)	       16.41 (-36.36%)	       15.78 (-31.17%)	       11.88 (  1.30%)	       12.50 ( -3.90%)	       12.03
              length=32, alignment=0:        15.94 ( -5.15%)	       18.28 (-20.62%)	       18.59 (-22.68%)	       14.22 (  6.18%)	       13.44 ( 11.34%)	       15.16
              length=32, alignment=7:        15.16 ( -4.30%)	       18.44 (-26.88%)	       17.19 (-18.28%)	       12.81 ( 11.83%)	       13.12 (  9.68%)	       14.53
              length=32, alignment=5:        15.31 ( -7.69%)	       20.94 (-47.25%)	       16.41 (-15.38%)	       12.34 ( 13.19%)	       12.81 (  9.89%)	       14.22
              length=21, alignment=5:        16.09 (-17.05%)	       18.28 (-32.95%)	       15.94 (-15.91%)	       12.03 ( 12.50%)	       13.12 (  4.55%)	       13.75
              length=64, alignment=0:        18.59 ( -4.39%)	       23.12 (-29.82%)	       19.22 ( -7.90%)	       15.62 ( 12.28%)	       15.94 ( 10.53%)	       17.81
              length=64, alignment=7:        18.12 (-10.48%)	       23.91 (-45.71%)	       19.69 (-20.00%)	       14.69 ( 10.48%)	       14.53 ( 11.43%)	       16.41
              length=64, alignment=6:        17.19 ( -1.85%)	       23.12 (-37.04%)	       24.06 (-42.59%)	       14.69 ( 12.96%)	       14.53 ( 13.89%)	       16.88
              length=42, alignment=6:        18.91 (-16.35%)	       20.16 (-24.04%)	       17.19 ( -5.77%)	       14.06 ( 13.46%)	       15.94 (  1.92%)	       16.25
             length=128, alignment=0:        21.09 (  4.25%)	       32.81 (-48.94%)	       21.72 (  1.42%)	       19.22 ( 12.77%)	       19.22 ( 12.77%)	       22.03
             length=128, alignment=7:        19.38 ( 10.14%)	       32.66 (-51.45%)	       21.72 ( -0.72%)	       19.22 ( 10.87%)	       18.44 ( 14.49%)	       21.56
             length=128, alignment=7:        18.75 ( 12.41%)	       31.09 (-45.26%)	       19.69 (  8.03%)	       19.22 ( 10.22%)	       18.44 ( 13.87%)	       21.41
              length=85, alignment=7:        21.72 (-17.80%)	       26.56 (-44.07%)	       24.22 (-31.36%)	       17.03 (  7.63%)	       16.56 ( 10.17%)	       18.44
             length=256, alignment=0:        30.16 (  3.50%)	       64.22 (-105.50%)	       25.94 ( 17.00%)	       26.88 ( 14.00%)	       26.56 ( 15.00%)	       31.25
             length=256, alignment=7:        28.75 (  7.07%)	       51.25 (-65.66%)	       28.75 (  7.07%)	       27.19 ( 12.12%)	       27.66 ( 10.61%)	       30.94
             length=256, alignment=8:        29.06 (  5.58%)	       65.47 (-112.69%)	       25.62 ( 16.75%)	       27.03 ( 12.18%)	       27.81 (  9.64%)	       30.78
             length=170, alignment=8:        24.53 (  4.85%)	       38.28 (-48.48%)	       22.66 ( 12.12%)	       23.59 (  8.48%)	       22.19 ( 13.94%)	       25.78
             length=512, alignment=0:        45.47 (  9.91%)	       94.22 (-86.69%)	       37.50 ( 25.70%)	       43.75 ( 13.31%)	       43.44 ( 13.93%)	       50.47
             length=512, alignment=7:        44.84 ( 10.03%)	       94.22 (-89.03%)	       38.28 ( 23.20%)	       43.91 ( 11.91%)	       44.06 ( 11.60%)	       49.84
             length=512, alignment=9:        44.53 ( 11.49%)	       97.03 (-92.86%)	       37.97 ( 24.53%)	       43.44 ( 13.66%)	       43.91 ( 12.73%)	       50.31
             length=341, alignment=9:        35.94 (  8.37%)	       71.72 (-82.87%)	       30.62 ( 21.91%)	       32.19 ( 17.93%)	       34.38 ( 12.35%)	       39.22
            length=1024, alignment=0:        78.75 ( 11.27%)	      168.28 (-89.61%)	       61.09 ( 31.16%)	      103.12 (-16.20%)	       76.41 ( 13.91%)	       88.75
            length=1024, alignment=7:        76.88 ( 11.83%)	      168.28 (-93.01%)	       62.03 ( 28.85%)	      105.94 (-21.51%)	       77.50 ( 11.11%)	       87.19
           length=1024, alignment=10:        77.81 ( 11.23%)	      170.78 (-94.83%)	       61.88 ( 29.41%)	      102.66 (-17.11%)	       77.66 ( 11.41%)	       87.66
            length=682, alignment=10:        60.31 (  9.18%)	      125.94 (-89.65%)	       45.31 ( 31.76%)	       55.16 ( 16.94%)	       58.44 ( 12.00%)	       66.41
            length=2048, alignment=0:       145.94 ( 13.84%)	      316.09 (-86.62%)	      110.78 ( 34.59%)	      143.59 ( 15.22%)	      144.69 ( 14.58%)	      169.38
            length=2048, alignment=7:       145.31 ( 16.44%)	      316.09 (-81.76%)	      111.09 ( 36.12%)	      144.53 ( 16.89%)	      143.28 ( 17.61%)	      173.91
           length=2048, alignment=11:       144.84 ( 16.86%)	      319.38 (-83.32%)	      111.25 ( 36.14%)	      144.38 ( 17.13%)	      143.59 ( 17.58%)	      174.22
           length=1365, alignment=11:       101.41 ( 17.01%)	      221.41 (-81.20%)	       78.59 ( 35.68%)	      100.94 ( 17.39%)	      100.78 ( 17.52%)	      122.19
            length=4096, alignment=0:       280.00 ( 10.62%)	      617.19 (-97.01%)	      221.88 ( 29.18%)	      301.41 (  3.79%)	      278.44 ( 11.12%)	      313.28
            length=4096, alignment=7:       283.75 ( 12.61%)	      618.44 (-90.47%)	      208.12 ( 35.90%)	      292.34 (  9.96%)	      277.81 ( 14.44%)	      324.69
           length=4096, alignment=12:       283.59 ( 12.87%)	      621.25 (-90.88%)	      208.12 ( 36.05%)	      293.75 (  9.75%)	      277.34 ( 14.79%)	      325.47
           length=2730, alignment=12:       202.66 (  8.85%)	      424.06 (-90.72%)	      142.34 ( 35.98%)	      203.91 (  8.29%)	      201.88 (  9.21%)	      222.34
---
 sysdeps/aarch64/multiarch/Makefile          |   2 +-
 sysdeps/aarch64/multiarch/ifunc-impl-list.c |   1 +
 sysdeps/aarch64/multiarch/strlen.c          |   7 +-
 sysdeps/aarch64/multiarch/strlen_kunpeng.S  | 178 ++++++++++++++++++++++++++++
 4 files changed, 186 insertions(+), 2 deletions(-)
 create mode 100644 sysdeps/aarch64/multiarch/strlen_kunpeng.S

diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 4150b89a90..b24325ca01 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -3,5 +3,5 @@  sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
 		   memcpy_falkor memmove_falkor \
 		   memset_generic memset_falkor memset_emag \
 		   memchr_generic memchr_nosimd \
-		   strlen_generic strlen_asimd
+		   strlen_generic strlen_asimd strlen_kunpeng
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index be13b916e5..b476f09a44 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -60,6 +60,7 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   IFUNC_IMPL (i, name, strlen,
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)
+	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_kunpeng)
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_generic))
 
   return i;
diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c
index 1db01babee..2c5d2c511b 100644
--- a/sysdeps/aarch64/multiarch/strlen.c
+++ b/sysdeps/aarch64/multiarch/strlen.c
@@ -32,9 +32,14 @@  extern __typeof (__redirect_strlen) __strlen;
 
 extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
+extern __typeof (__redirect_strlen) __strlen_kunpeng attribute_hidden;
 
 libc_ifunc (__strlen,
-	    (USE_ASIMD_STRLEN () ? __strlen_asimd : __strlen_generic));
+	    (USE_ASIMD_STRLEN ()
+	    ? __strlen_asimd
+	    : (IS_KUNPENG (midr)
+	    ? __strlen_kunpeng
+	    : __strlen_generic)));
 
 # undef strlen
 strong_alias (__strlen, strlen);
diff --git a/sysdeps/aarch64/multiarch/strlen_kunpeng.S b/sysdeps/aarch64/multiarch/strlen_kunpeng.S
new file mode 100644
index 0000000000..fef312cc5c
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/strlen_kunpeng.S
@@ -0,0 +1,178 @@ 
+/* Optimized strlen for Huawei Kunpeng processor.
+   Copyright (C) 2018-2019 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+
+   ARMv8-a, AArch64, ASIMD, unaligned accesses, min page size 4k.  */
+
+/* To test the page crossing code path more thoroughly, compile with
+   -DTEST_PAGE_CROSS - this will force all calls through the slower
+   entry path.  This option is not intended for production use.  */
+
+/* Arguments and results.  */
+#define srcin		x0
+#define len		x0
+
+/* Locals and temporaries.  */
+#define src		x1
+#define data1		x2
+#define data2		x3
+#define has_nul1	x4
+#define has_nul2	x5
+#define tmp1		x4
+#define tmp2		x5
+#define tmp3		x6
+#define tmp4		x7
+#define zeroones	x8
+#define dataq		q2
+#define datav		v2
+#define datab2		b3
+#define dataq2		q3
+#define datav2		v3
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
+#ifdef TEST_PAGE_CROSS
+# define MIN_PAGE_SIZE 16
+#else
+# define MIN_PAGE_SIZE 4096
+#endif
+
+	/* Since strings are short on average, we check the first 16 bytes
+	   of the string for a NUL character.  In order to do an unaligned load
+	   safely we have to do a page cross check first.  If there is a NUL
+	   byte we calculate the length from the 2 8-byte words using
+	   conditional select to reduce branch mispredictions (it is unlikely
+	   strlen_kunpeng will be repeatedly called on strings with the same
+	   length).
+
+	   If the string is longer than 16 bytes, we align src so don't need
+	   further page cross checks, and process 16 bytes per iteration.
+
+	   If the page cross check fails, we read 16 bytes from an aligned
+	   address, remove any characters before the string, and continue
+	   in the main loop using aligned loads.  Since strings crossing a
+	   page in the first 16 bytes are rare (probability of
+	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
+
+	   AArch64 systems have a minimum page size of 4k.  We don't bother
+	   checking for larger page sizes - the cost of setting up the correct
+	   page size is just not worth the extra gain from a small reduction in
+	   the cases taking the slow path.  Note that we only care about
+	   whether the first fetch, which may be misaligned, crosses a page
+	   boundary.  */
+
+ENTRY_ALIGN (__strlen_kunpeng, 6)
+	DELOUSE (0)
+	DELOUSE (1)
+	and	tmp1, srcin, MIN_PAGE_SIZE - 1	/* Offset within the page.  */
+	mov	zeroones, REP8_01
+	cmp	tmp1, MIN_PAGE_SIZE - 16
+	b.gt	L(page_cross)
+	ldp	data1, data2, [srcin]
+#ifdef __AARCH64EB__
+	rev	data1, data1
+	rev	data2, data2
+#endif
+
+	/* (x - 0x01..01) & ~(x | 0x7f..7f) has the high bit set in each
+	   byte of x that is zero.  */
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	b.eq	L(main_loop_entry)	/* No NUL in the first 16 bytes.  */
+	csel	has_nul1, has_nul1, has_nul2, cc	/* cc => NUL is in data1.  */
+	mov	len, 8
+	rev	has_nul1, has_nul1
+	clz	tmp1, has_nul1
+	csel	len, xzr, len, cc
+	add	len, len, tmp1, lsr 3	/* Convert bit index to byte index.  */
+	ret
+
+L(main_loop_entry):
+	bic	src, srcin, 15
+	sub	src, src, 16	/* Pre-bias for the writeback load below.  */
+
+L(main_loop):
+	ldr	dataq, [src, 32]!
+L(page_cross_entry):
+	/* Get the minimum value and keep going if it is not zero.  */
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
+	cbz	tmp1, L(tail)
+	ldr	dataq, [src, 16]
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
+	cbnz	tmp1, L(main_loop)
+	add	src, src, 16
+
+L(tail):
+#ifdef __AARCH64EB__
+	rev64	datav.16b, datav.16b
+#endif
+	/* Set the NUL byte as 0xff and the rest as 0x00, move the data into a
+	   pair of scalars and then compute the length from the earliest NUL
+	   byte.  */
+	cmeq	datav.16b, datav.16b, #0
+	mov	data1, datav.d[0]
+	mov	data2, datav.d[1]
+	cmp	data1, 0
+	csel	data1, data1, data2, ne
+	sub	len, src, srcin	/* Bytes up to the chunk held in datav.  */
+	rev	data1, data1
+	add	tmp2, len, 8
+	clz	tmp1, data1
+	csel	len, len, tmp2, ne
+	add	len, len, tmp1, lsr 3
+	ret
+
+	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
+	   srcin to 0xff, so we ignore any NUL bytes before the string.
+	   Then continue in the aligned loop.  */
+L(page_cross):
+	mov	tmp3, 63	/* Shifts are mod 64, so cap the count at 63.  */
+	bic	src, srcin, 15
+	and	tmp1, srcin, 7
+	ands	tmp2, srcin, 8
+	ldr	dataq, [src]
+	lsl	tmp1, tmp1, 3	/* Byte offset to bit offset.  */
+	csel	tmp2, tmp2, tmp1, eq
+	csel	tmp1, tmp1, tmp3, eq
+	mov	tmp4, -1
+#ifdef __AARCH64EB__
+	/* Big-endian.  Early bytes are at MSB.  */
+	lsr	tmp1, tmp4, tmp1
+	lsr	tmp2, tmp4, tmp2
+#else
+	/* Little-endian.  Early bytes are at LSB.  */
+	lsl	tmp1, tmp4, tmp1
+	lsl	tmp2, tmp4, tmp2
+#endif
+	mov	datav2.d[0], tmp1
+	mov	datav2.d[1], tmp2
+	orn	datav.16b, datav.16b, datav2.16b
+	b	L(page_cross_entry)
+END (__strlen_kunpeng)
+weak_alias (__strlen_kunpeng, strlen_kunpeng)
+libc_hidden_builtin_def (strlen_kunpeng)