[3/3] aarch64: Optimized memchr specific to AmpereComputing skylark

Message ID PS1PR0201MB1771610330EC94F7A9EB1CDE84E20@PS1PR0201MB1771.apcprd02.prod.outlook.com

Commit Message

Xue Feng Oct. 12, 2018, 11:44 a.m. UTC
  This version uses general-register-based memory instructions to load
data, because vector-register-based loads are slightly slower on skylark.

Character matching is performed in parallel on a 16-byte memory block
(16-byte in both size and alignment) in each iteration.

Another minor optimization is to preload the next 16-byte block at the
beginning of each iteration, which can hide some memory access latency.
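
For illustration, the matching scheme could be sketched in C roughly as
follows (a minimal sketch with hypothetical names, not code taken from
the patch):

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_80 0x8080808080808080ULL

/* Nonzero iff some byte of X is zero (classic zero-byte test).  */
static inline uint64_t
has_zero_byte (uint64_t x)
{
  return (x - REP8_01) & ~x & REP8_80;
}

/* Does a 16-byte block, read as two qwords, contain byte C?  XORing
   with the repeated char turns matching bytes into zero bytes.  */
static inline int
block_has_char (const uint64_t block[2], unsigned char c)
{
  uint64_t repchr = c * REP8_01;	/* |c|c|c|c|c|c|c|c|  */
  return has_zero_byte (block[0] ^ repchr)
	 || has_zero_byte (block[1] ^ repchr);
}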

    * sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR.
    [!MEMCHR](MEMCHR): Set to __memchr.
    * sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
    Add memchr_generic and memchr_skylark.
    * sysdeps/aarch64/multiarch/ifunc-impl-list.c
    (__libc_ifunc_impl_list): Add memchr ifuncs.
    * sysdeps/aarch64/multiarch/memchr.c: New file.
    * sysdeps/aarch64/multiarch/memchr_generic.S: Likewise.
    * sysdeps/aarch64/multiarch/memchr_skylark.S: Likewise.
---
 ChangeLog                                   |  12 ++
 sysdeps/aarch64/memchr.S                    |  10 +-
 sysdeps/aarch64/multiarch/Makefile          |   1 +
 sysdeps/aarch64/multiarch/ifunc-impl-list.c |   3 +
 sysdeps/aarch64/multiarch/memchr.c          |  41 +++++
 sysdeps/aarch64/multiarch/memchr_generic.S  |  33 +++++
 sysdeps/aarch64/multiarch/memchr_skylark.S  | 222 ++++++++++++++++++++++++++++
 7 files changed, 319 insertions(+), 3 deletions(-)
 create mode 100644 sysdeps/aarch64/multiarch/memchr.c
 create mode 100644 sysdeps/aarch64/multiarch/memchr_generic.S
 create mode 100644 sysdeps/aarch64/multiarch/memchr_skylark.S
  

Comments

Richard Henderson Oct. 12, 2018, 4:18 p.m. UTC | #1
On 10/12/18 4:44 AM, Xue Feng wrote:
> +L(loop):
> +	/*
> +	 * Preload the next 16-byte aligned block to hide some memory
> +	 * access latency.
> +	 */
> +	ldp	data1, data2, [src, 16]!

You seem to be under the mistaken impression that you are allowed to search all
cntin bytes.  However, memchr "shall behave as if it reads the characters
sequentially and stops as soon as a matching character is found."


r~
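
In C terms, the required behavior is that of a plain sequential scan;
a reference sketch (not glibc's implementation) of what the standard
mandates:

#include <stddef.h>

void *
memchr_ref (const void *s, int c, size_t n)
{
  const unsigned char *p = s;
  for (size_t i = 0; i < n; i++)
    if (p[i] == (unsigned char) c)
      /* Stop here: bytes beyond the match may never be accessed.  */
      return (void *) (p + i);
  return NULL;
}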
  
Xue Feng Oct. 15, 2018, 3:20 a.m. UTC | #2
Yes, I missed one situation. If the next block is just the start of a new page with no access, this load will cause a segfault.

Thanks,
Feng
  
Adhemerval Zanella Netto Oct. 15, 2018, 11:34 a.m. UTC | #3
If we don't have a test for it, please add one along with the patch submission.
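
Such a test could place the buffer immediately before an inaccessible
page, so that any read past the first match faults (a rough sketch of
the idea, not an actual glibc test case):

#include <assert.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main (void)
{
  size_t page = (size_t) sysconf (_SC_PAGESIZE);
  /* Map two pages and make the second one inaccessible.  */
  char *map = mmap (NULL, 2 * page, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  assert (map != MAP_FAILED);
  assert (mprotect (map + page, page, PROT_NONE) == 0);

  memset (map, 'x', page);
  map[page - 1] = 'a';		/* Match in the last readable byte.  */

  /* The count extends past the match, which is legal: memchr must
     stop at the first match, so it may never touch the second page.
     A preload of the next 16-byte block would fault here.  */
  assert (memchr (map, 'a', 2 * page) == map + page - 1);
  return 0;
}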

On 15/10/2018 00:20, Xue Feng wrote:
> Yes, I missed one situation. If the next block is just the start of a new page with no access, this load will cause a segfault.
> 
> Thanks,
> Feng
  

Patch

diff --git a/ChangeLog b/ChangeLog
index 28370f9..e64b8b3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@ 
+2018-10-13  Feng Xue  <feng.xue@amperecomputing.com>
+
+	* sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR.
+	[!MEMCHR](MEMCHR): Set to __memchr.
+	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
+	Add memchr_generic and memchr_skylark.
+	* sysdeps/aarch64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add memchr ifuncs.
+	* sysdeps/aarch64/multiarch/memchr.c: New file.
+	* sysdeps/aarch64/multiarch/memchr_generic.S: Likewise.
+	* sysdeps/aarch64/multiarch/memchr_skylark.S: Likewise.
+
 2018-10-12  Feng Xue  <feng.xue@amperecomputing.com>
 
 	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
index e422aef..4afebd3 100644
--- a/sysdeps/aarch64/memchr.S
+++ b/sysdeps/aarch64/memchr.S
@@ -26,6 +26,10 @@ 
  * Neon Available.
  */
 
+#ifndef MEMCHR
+# define MEMCHR __memchr
+#endif
+
 /* Arguments and results.  */
 #define srcin		x0
 #define chrin		w1
@@ -59,7 +63,7 @@ 
  * identify exactly which byte has matched.
  */
 
-ENTRY (__memchr)
+ENTRY (MEMCHR)
 	/* Do not dereference srcin if no bytes to compare.  */
 	cbz	cntin, L(zero_length)
 	/*
@@ -152,6 +156,6 @@  L(tail):
 L(zero_length):
 	mov	result, #0
 	ret
-END (__memchr)
-weak_alias (__memchr, memchr)
+END (MEMCHR)
+weak_alias (MEMCHR, memchr)
 libc_hidden_builtin_def (memchr)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 828ce4f..353ece7 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -2,5 +2,6 @@  ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
 		   memcpy_falkor memmove_falkor \
 		   memset_generic memset_falkor memset_skylark \
+		   memchr_generic memchr_skylark \
 		   strlen_generic strlen_asimd
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index baf01a0..f5014d2 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -53,6 +53,9 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
 	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_skylark)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
+  IFUNC_IMPL (i, name, memchr,
+	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_skylark)
+	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_generic))
 
   IFUNC_IMPL (i, name, strlen,
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)
diff --git a/sysdeps/aarch64/multiarch/memchr.c b/sysdeps/aarch64/multiarch/memchr.c
new file mode 100644
index 0000000..cbcf8b7
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memchr.c
@@ -0,0 +1,41 @@ 
+/* Multiple versions of memchr. AARCH64 version.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+
+#if IS_IN (libc)
+/* Redefine memchr so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memchr
+# define memchr __redirect_memchr
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memchr) __memchr;
+
+extern __typeof (__redirect_memchr) __memchr_generic attribute_hidden;
+extern __typeof (__redirect_memchr) __memchr_skylark attribute_hidden;
+
+libc_ifunc (__memchr,
+	    ((IS_SKYLARK (midr)
+	       ? __memchr_skylark
+	       : __memchr_generic)));
+
+# undef memchr
+strong_alias (__memchr, memchr);
+#endif
diff --git a/sysdeps/aarch64/multiarch/memchr_generic.S b/sysdeps/aarch64/multiarch/memchr_generic.S
new file mode 100644
index 0000000..707148b
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memchr_generic.S
@@ -0,0 +1,33 @@ 
+/* Memchr for aarch64, default version for internal use.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# define MEMCHR __memchr_generic
+
+/* Do not hide the generic version of memchr; we use it internally.  */
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)
+
+/* Add a hidden definition for use within libc.so.  */
+# ifdef SHARED
+	.globl __GI_memchr; __GI_memchr = __memchr_generic
+# endif
+#endif
+
+# include "../memchr.S"
diff --git a/sysdeps/aarch64/multiarch/memchr_skylark.S b/sysdeps/aarch64/multiarch/memchr_skylark.S
new file mode 100644
index 0000000..9277104
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memchr_skylark.S
@@ -0,0 +1,222 @@ 
+/* Optimized memchr for AmpereComputing skylark processor.
+
+   Copyright (C) 2018 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+# define MEMCHR __memchr_skylark
+
+/* Arguments and results.  */
+#define srcin		x0
+#define chrin		x1
+#define cntin		x2
+
+#define result		x0
+
+#define repchr		x1
+
+#define tmp1		x2
+#define tmp2		x3
+#define tmp3		x4
+#define tmp4		x5
+
+#define src		x6
+#define srcend		x7
+#define srcend16	x8
+
+#define zeroones	x9
+
+#define data1		x10
+#define data2		x11
+#define data1a		x12
+#define data2a		x13
+
+#define has_chr1	x14
+#define has_chr2	x15
+
+#define REP8_01		0x0101010101010101
+#define REP8_7f		0x7f7f7f7f7f7f7f7f
+
+ENTRY_ALIGN (MEMCHR, 6)
+
+	DELOUSE (0)
+	DELOUSE (2)
+
+	/* Do not dereference srcin if no bytes to compare. */
+	cbz	cntin, L(none_chr)
+
+	bic	src, srcin, 15
+	/*
+	 * Load the first 16-byte aligned block where memory to
+	 * search starts.
+	 */
+	ldp	data1, data2, [src]
+
+	mov	zeroones, REP8_01
+	and	repchr, chrin, 255
+	/* Generate a qword integer as |c|c|c|c|c|c|c|c|. */
+	mul	repchr, repchr, zeroones
+
+	add	srcend, srcin, cntin
+	/* srcend16 is the start address of the last 16-byte aligned block. */
+	sub	srcend, srcend, 1
+	bic	srcend16, srcend, 15
+
+	lsl	tmp1, srcin, 3
+	mov	tmp2, ~0
+#ifdef __AARCH64EB__
+	lsr	tmp3, tmp2, tmp1
+#else
+	lsl	tmp3, tmp2, tmp1
+#endif
+	/* Is the real starting point in the first or the second qword? */
+	tst	srcin, 8
+
+	/*
+	 * Transform any byte in the 16-byte block that equals the
+	 * specified char to zero using an xor operation.  In this way,
+	 * searching for the char becomes detecting zero in the
+	 * resulting two qword integers.
+	 */
+	eor	data1a, data1, repchr
+	eor	data2a, data2, repchr
+
+	/*
+	 * Set the unused bytes (before the real starting point) to
+	 * 0xff, so that they will not trigger zero detection.
+	 */
+	orn	tmp1, data1a, tmp3
+	orn	tmp2, data2a, tmp3
+
+	csinv	data1a, tmp1, xzr, eq
+	csel	data2a, data2a, tmp2, eq
+
+	cmp	src, srcend16
+	b.eq	L(last)
+
+L(loop):
+	/*
+	 * Preload the next 16-byte aligned block to hide some memory
+	 * access latency.
+	 */
+	ldp	data1, data2, [src, 16]!
+
+	/*
+	 * Use the following integer test to find out whether any byte
+	 * in a qword is zero.  If the qword contains no zero-valued
+	 * byte, the test result is zero.
+	 *
+	 *  (qword - 0x0101010101010101) & ~(qword) & 0x8080808080808080
+	 * =
+	 *  (qword - 0x0101010101010101) & ~(qword | 0x7f7f7f7f7f7f7f7f)
+	 *
+	 */
+	sub	tmp1, data1a, zeroones
+	sub	tmp2, data2a, zeroones
+
+	orr	tmp3, data1a, REP8_7f
+	orr	tmp4, data2a, REP8_7f
+
+	bics	has_chr1, tmp1, tmp3
+	bic	has_chr2, tmp2, tmp4
+
+	ccmp	has_chr2, 0, 0, eq	/* NZCV = 0000 */
+	ccmp	src, srcend16, 4, eq	/* NZCV = 0100 */
+
+	/*
+	 * Transform any byte in the next 16-byte aligned block that
+	 * equals the specified char to zero using an xor operation.
+	 */
+	eor	data1a, data1, repchr
+	eor	data2a, data2, repchr
+
+	/*
+	 * Use an equal/not-equal comparison here, instead of less or
+	 * greater, so that it works even if the address wraps around.
+	 */
+	b.ne	L(loop)
+
+	cbz	has_chr1, 1f
+	sub	src, src, 16
+#ifdef __AARCH64EB__
+	/*
+	 * For big-endian, has_chr1/has_chr2 cannot be used directly,
+	 * since the two qwords were byte-reversed when loaded from
+	 * memory.  Instead, restore the two qwords, byte-swap them,
+	 * and then perform char detection on the result.
+	 */
+	add	data2a, tmp1, zeroones
+	b	L(find_chr)
+#else
+	b	L(done)
+#endif
+
+1:	cbz	has_chr2, L(last)
+	sub	src, src, 8
+#ifdef __AARCH64EB__
+	add	data2a, tmp2, zeroones
+	b	L(find_chr)
+#else
+	mov	has_chr1, has_chr2
+	b	L(done)
+#endif
+
+L(last):
+#ifdef __AARCH64EB__
+	rev	data1a, data1a
+#endif
+	sub	tmp1, data1a, zeroones
+	orr	tmp3, data1a, REP8_7f
+	bics	has_chr1, tmp1, tmp3
+	b.ne    L(done)
+
+	add	src, src, 8
+#ifdef __AARCH64EB__
+L(find_chr):
+	rev	data2a, data2a
+#endif
+	sub	tmp1, data2a, zeroones
+	orr	tmp3, data2a, REP8_7f
+	bics	has_chr1, tmp1, tmp3
+	b.eq	L(none_chr)
+
+L(done):
+	/*
+	 * If the specified char is found in a qword, the top bit of
+	 * the corresponding byte in has_chr1 is set, but this is
+	 * guaranteed only for the first occurrence, not for all.
+	 */
+	rev	has_chr1, has_chr1
+	bic	tmp1, src, 15
+	cmp	tmp1, srcend16	/* At the last 16-byte aligned block? */
+	clz	tmp2, has_chr1
+	add	src, src, tmp2, lsr 3
+	ccmp	src, srcend, 0, eq	/* NZCV = 0000 */
+	csel	result, src, xzr, ls
+	ret
+
+L(none_chr):
+	mov	result, 0
+	ret
+
+END (MEMCHR)
+libc_hidden_builtin_def (MEMCHR)
+
+#endif
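
For reference, the qword zero-byte test used in the loop above can be
checked with a small standalone C program (illustrative only, not part
of the patch).  It also shows why only the lowest flagged byte, the
first occurrence, is reliable, which is why the code locates the match
with REV and CLZ:

#include <stdint.h>
#include <stdio.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL
#define REP8_80 0x8080808080808080ULL

int
main (void)
{
  uint64_t x = 0x6665444333220100ULL;	/* Zero byte at position 0.  */
  /* The two forms are equal because ~x & REP8_80 == ~(x | REP8_7f),
     which is what lets the assembly reuse an ORR with REP8_7f.  */
  uint64_t a = (x - REP8_01) & ~x & REP8_80;
  uint64_t b = (x - REP8_01) & ~(x | REP8_7f);
  /* Prints 0000000000008080 twice: byte 0 (the real zero) is flagged,
     and byte 1 (0x01) is flagged falsely because of the borrow from
     byte 0, so bytes above the first zero are unreliable.  */
  printf ("%016llx\n%016llx\n", (unsigned long long) a,
	  (unsigned long long) b);
  return 0;
}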