[2/3] aarch64: Optimized memchr specific to AmpereComputing emag

Message ID BL0PR01MB4593146ADFE218DF380815BCF7BD0@BL0PR01MB4593.prod.exchangelabs.com
State New, archived
Headers

Commit Message

Feng Xue OS Dec. 18, 2018, 10:07 a.m. UTC
  simple_memchr	__memchr_base	__memchr_generic
Length 2048, position   32, alignment  0:	61.25	24.6875	29.375
Length  256, position   64, alignment  1:	85.625	27.8125	37.5
Length 2048, position   32, alignment  0:	49.0625	20.3125	27.5
Length  256, position   64, alignment  1:	85	28.125	36.875
Length 2048, position   64, alignment  0:	85.3125	25.9375	35.9375
Length  256, position   64, alignment  2:	84.6875	26.5625	36.5625
Length 2048, position   64, alignment  0:	84.6875	25.625	35
Length  256, position   64, alignment  2:	85.3125	26.5625	36.5625
Length 2048, position  128, alignment  0:	157.188	35	50.625
Length  256, position   64, alignment  3:	84.6875	26.5625	35.9375
Length 2048, position  128, alignment  0:	157.812	34.6875	50
Length  256, position   64, alignment  3:	85.3125	26.5625	35.9375
Length 2048, position  256, alignment  0:	302.812	54.0625	82.1875
Length  256, position   64, alignment  4:	85.625	26.5625	36.25
Length 2048, position  256, alignment  0:	303.125	53.125	81.5625
Length  256, position   64, alignment  4:	85.9375	26.25	35.9375
Length 2048, position  512, alignment  0:	593.125	90.625	142.5
Length  256, position   64, alignment  5:	85.625	26.5625	36.25
Length 2048, position  512, alignment  0:	592.5	90.3125	142.5
Length  256, position   64, alignment  5:	85.625	26.875	36.25
Length 2048, position 1024, alignment  0:	1184.06	165.938	265.938
Length  256, position   64, alignment  6:	85.3125	26.5625	36.5625
Length 2048, position 1024, alignment  0:	1180	165.625	265
Length  256, position   64, alignment  6:	85.3125	26.5625	36.25
Length 2048, position 2048, alignment  0:	2339.06	312.812	502.188
Length  256, position   64, alignment  7:	85.625	26.875	36.5625
Length 2048, position 2048, alignment  0:	2338.75	310.625	501.25
Length  256, position   64, alignment  7:	85.3125	26.25	35.625
Length    2, position    1, alignment  0:	14.6875	15.9375	18.4375
Length    2, position    1, alignment  0:	14.375	15.9375	18.125
Length    2, position    1, alignment  1:	14.0625	16.5625	22.8125
Length    2, position    1, alignment  1:	14.375	16.5625	23.125
Length    3, position    2, alignment  0:	15.625	16.25	18.4375
Length    3, position    2, alignment  0:	15.3125	15.625	17.8125
Length    3, position    2, alignment  2:	15	16.5625	22.8125
Length    3, position    2, alignment  2:	15	16.875	22.5
Length    4, position    3, alignment  0:	17.1875	15.3125	18.4375
Length    4, position    3, alignment  0:	16.25	15.625	18.125
Length    4, position    3, alignment  3:	16.25	16.25	22.8125
Length    4, position    3, alignment  3:	15.9375	15.9375	23.125
Length    5, position    4, alignment  0:	18.75	15.625	18.125
Length    5, position    4, alignment  0:	18.4375	15.9375	17.8125
Length    5, position    4, alignment  4:	18.125	17.8125	22.8125
Length    5, position    4, alignment  4:	18.75	17.1875	22.8125
Length    6, position    5, alignment  0:	19.6875	15.625	17.8125
Length    6, position    5, alignment  0:	19.0625	15.3125	18.125
Length    6, position    5, alignment  5:	19.375	17.1875	22.8125
Length    6, position    5, alignment  5:	19.375	17.1875	22.8125
Length    7, position    6, alignment  0:	19.375	15.9375	17.8125
Length    7, position    6, alignment  0:	19.0625	15.625	18.125
Length    7, position    6, alignment  6:	18.75	16.875	23.125
Length    7, position    6, alignment  6:	19.0625	16.875	22.5
Length    8, position    7, alignment  0:	20.3125	15.625	18.125
Length    8, position    7, alignment  0:	20	15.625	18.4375
Length    8, position    7, alignment  7:	19.375	17.1875	22.5
Length    8, position    7, alignment  7:	19.6875	16.875	22.8125
Length    9, position    8, alignment  0:	21.875	16.5625	17.8125
Length    9, position    8, alignment  0:	21.875	16.25	18.125
Length    9, position    8, alignment  0:	22.1875	15.9375	18.125
Length    9, position    8, alignment  0:	21.875	16.25	18.125
Length   10, position    9, alignment  0:	23.75	15.625	17.8125
Length   10, position    9, alignment  0:	23.125	15.625	18.4375
Length   10, position    9, alignment  1:	23.125	17.5	22.8125
Length   10, position    9, alignment  1:	28.125	17.1875	22.8125
Length   11, position   10, alignment  0:	23.4375	15.9375	17.8125
Length   11, position   10, alignment  0:	23.4375	15.9375	18.4375
Length   11, position   10, alignment  2:	23.75	17.1875	22.8125
Length   11, position   10, alignment  2:	23.4375	16.875	22.5
Length   12, position   11, alignment  0:	24.375	15.9375	17.8125
Length   12, position   11, alignment  0:	25	16.25	17.8125
Length   12, position   11, alignment  3:	24.375	17.1875	22.5
Length   12, position   11, alignment  3:	24.6875	17.1875	22.8125
Length   13, position   12, alignment  0:	25	15.9375	18.125
Length   13, position   12, alignment  0:	25.3125	15.625	17.8125
Length   13, position   12, alignment  4:	25.3125	20	22.8125
Length   13, position   12, alignment  4:	25	20	23.125
Length   14, position   13, alignment  0:	27.1875	15.9375	17.8125
Length   14, position   13, alignment  0:	27.1875	15.625	17.8125
Length   14, position   13, alignment  5:	26.875	19.6875	22.8125
Length   14, position   13, alignment  5:	27.1875	19.6875	22.8125
Length   15, position   14, alignment  0:	28.125	16.25	18.4375
Length   15, position   14, alignment  0:	28.125	15.625	17.8125
Length   15, position   14, alignment  6:	28.125	19.6875	22.5
Length   15, position   14, alignment  6:	27.8125	19.6875	22.8125
Length   16, position   15, alignment  0:	28.75	15.9375	18.125
Length   16, position   15, alignment  0:	29.375	15.9375	18.125
Length   16, position   15, alignment  7:	28.75	19.6875	22.8125
Length   16, position   15, alignment  7:	29.375	19.6875	22.8125
Length   17, position   16, alignment  0:	30.625	18.125	17.8125
Length   17, position   16, alignment  0:	30.3125	17.8125	18.125
Length   17, position   16, alignment  0:	30.9375	17.8125	17.8125
Length   17, position   16, alignment  0:	30.3125	17.5	18.125
Length   18, position   17, alignment  0:	32.1875	17.5	18.125
Length   18, position   17, alignment  0:	32.1875	17.8125	18.125
Length   18, position   17, alignment  1:	31.875	20	22.5
Length   18, position   17, alignment  1:	32.1875	19.6875	22.5
Length   19, position   18, alignment  0:	32.5	17.8125	18.125
Length   19, position   18, alignment  0:	32.5	17.5	17.5
Length   19, position   18, alignment  2:	32.1875	19.6875	22.5
Length   19, position   18, alignment  2:	32.1875	19.6875	22.5
Length   20, position   19, alignment  0:	33.125	17.5	17.8125
Length   20, position   19, alignment  0:	33.125	17.5	17.8125
Length   20, position   19, alignment  3:	33.125	19.375	22.8125
Length   20, position   19, alignment  3:	33.4375	19.6875	22.8125
Length   21, position   20, alignment  0:	35.625	17.5	18.125
Length   21, position   20, alignment  0:	35.3125	17.5	18.4375
Length   21, position   20, alignment  4:	34.6875	20.3125	22.1875
Length   21, position   20, alignment  4:	35.3125	19.6875	22.5
Length   22, position   21, alignment  0:	36.5625	17.5	17.5
Length   22, position   21, alignment  0:	36.25	17.1875	18.125
Length   22, position   21, alignment  5:	36.25	20	22.8125
Length   22, position   21, alignment  5:	36.5625	20.3125	22.8125
Length   23, position   22, alignment  0:	36.5625	17.5	18.125
Length   23, position   22, alignment  0:	36.5625	17.8125	18.125
Length   23, position   22, alignment  6:	36.5625	19.6875	22.8125
Length   23, position   22, alignment  6:	36.5625	20	22.8125
Length   24, position   23, alignment  0:	38.125	17.1875	18.125
Length   24, position   23, alignment  0:	37.8125	17.8125	18.125
Length   24, position   23, alignment  7:	37.5	20	22.5
Length   24, position   23, alignment  7:	37.8125	20	23.125
Length   25, position   24, alignment  0:	39.6875	18.4375	17.8125
Length   25, position   24, alignment  0:	39.6875	17.8125	18.125
Length   25, position   24, alignment  0:	39.6875	17.8125	18.4375
Length   25, position   24, alignment  0:	40	17.8125	18.125
Length   26, position   25, alignment  0:	40.9375	17.5	18.125
Length   26, position   25, alignment  0:	40.9375	17.8125	18.125
Length   26, position   25, alignment  1:	40.9375	20.3125	22.8125
Length   26, position   25, alignment  1:	41.25	20	22.8125
Length   27, position   26, alignment  0:	41.875	18.4375	17.8125
Length   27, position   26, alignment  0:	41.5625	18.125	17.8125
Length   27, position   26, alignment  2:	40.9375	19.6875	22.8125
Length   27, position   26, alignment  2:	41.25	20	22.8125
Length   28, position   27, alignment  0:	42.5	18.125	18.125
Length   28, position   27, alignment  0:	42.5	18.125	18.125
Length   28, position   27, alignment  3:	42.5	19.6875	23.125
Length   28, position   27, alignment  3:	42.5	19.6875	22.8125
Length   29, position   28, alignment  0:	44.375	17.8125	17.8125
Length   29, position   28, alignment  0:	44.375	18.125	17.8125
Length   29, position   28, alignment  4:	44.6875	21.5625	26.875
Length   29, position   28, alignment  4:	44.0625	21.25	27.1875
Length   30, position   29, alignment  0:	45.625	18.125	17.8125
Length   30, position   29, alignment  0:	45.3125	18.125	17.8125
Length   30, position   29, alignment  5:	45.625	21.25	26.5625
Length   30, position   29, alignment  5:	45.3125	20.625	27.1875
Length   31, position   30, alignment  0:	46.5625	18.125	17.8125
Length   31, position   30, alignment  0:	45.625	17.8125	17.8125
Length   31, position   30, alignment  6:	45.9375	21.25	26.875
Length   31, position   30, alignment  6:	45.625	21.25	27.1875
Length   32, position   31, alignment  0:	47.5	17.8125	18.125
Length   32, position   31, alignment  0:	47.1875	17.8125	18.4375
Length   32, position   31, alignment  7:	46.875	21.25	26.5625
Length   32, position   31, alignment  7:	47.1875	20.9375	27.1875

---

This version uses general-purpose-register based memory instructions to load
data, because vector-register based loads are slightly slower on emag.

Character-matching is performed on 16-byte (both size and alignment)
memory block in parallel each iteration.

    * sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR.
    [!MEMCHR](MEMCHR): Set to __memchr.
    * sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
    Add memchr_generic and memchr_base.
    * sysdeps/aarch64/multiarch/ifunc-impl-list.c
    (__libc_ifunc_impl_list): Add memchr ifuncs.
    * sysdeps/aarch64/multiarch/memchr.c: New file.
    * sysdeps/aarch64/multiarch/memchr_generic.S: Likewise.
    * sysdeps/aarch64/multiarch/memchr_base.S: Likewise.
---
 ChangeLog                                   |  12 ++
 sysdeps/aarch64/memchr.S                    |  10 +-
 sysdeps/aarch64/multiarch/Makefile          |   1 +
 sysdeps/aarch64/multiarch/ifunc-impl-list.c |   3 +
 sysdeps/aarch64/multiarch/memchr.c          |  41 +++++
 sysdeps/aarch64/multiarch/memchr_base.S     | 223 ++++++++++++++++++++++++++++
 sysdeps/aarch64/multiarch/memchr_generic.S  |  33 ++++
 7 files changed, 320 insertions(+), 3 deletions(-)
 create mode 100644 sysdeps/aarch64/multiarch/memchr.c
 create mode 100644 sysdeps/aarch64/multiarch/memchr_base.S
 create mode 100644 sysdeps/aarch64/multiarch/memchr_generic.S
  

Comments

Szabolcs Nagy Dec. 19, 2018, 3:41 p.m. UTC | #1
On 18/12/2018 10:07, Feng Xue wrote:
> This version uses general register based memory instruction to load

> data, because vector register based is slightly slower in emag.

> 

> Character-matching is performed on 16-byte (both size and alignment)

> memory block in parallel each iteration.

> 

>     * sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR.

>     [!MEMCHR](MEMCHR): Set to __memchr.

>     * sysdeps/aarch64/multiarch/Makefile (sysdep_routines):

>     Add memchr_generic and memchr_base.

>     * sysdeps/aarch64/multiarch/ifunc-impl-list.c

>     (__libc_ifunc_impl_list): Add memchr ifuncs.

>     * sysdeps/aarch64/multiarch/memchr.c: New file.

>     * sysdeps/aarch64/multiarch/memchr_generic.S: Likewise.

>     * sysdeps/aarch64/multiarch/memchr_base.S: Likewise.


OK to commit.
  
Siddhesh Poyarekar Dec. 20, 2018, 9:24 a.m. UTC | #2
On 18/12/18 3:37 PM, Feng Xue wrote:
> This version uses general register based memory instruction to load
> data, because vector register based is slightly slower in emag.
> 
> Character-matching is performed on 16-byte (both size and alignment)
> memory block in parallel each iteration.
> 
>      * sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR.
>      [!MEMCHR](MEMCHR): Set to __memchr.
>      * sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
>      Add memchr_generic and memchr_base.
>      * sysdeps/aarch64/multiarch/ifunc-impl-list.c
>      (__libc_ifunc_impl_list): Add memchr ifuncs.
>      * sysdeps/aarch64/multiarch/memchr.c: New file.
>      * sysdeps/aarch64/multiarch/memchr_generic.S: Likewise.
>      * sysdeps/aarch64/multiarch/memchr_base.S: Likewise.
> ---
>   ChangeLog                                   |  12 ++
>   sysdeps/aarch64/memchr.S                    |  10 +-
>   sysdeps/aarch64/multiarch/Makefile          |   1 +
>   sysdeps/aarch64/multiarch/ifunc-impl-list.c |   3 +
>   sysdeps/aarch64/multiarch/memchr.c          |  41 +++++
>   sysdeps/aarch64/multiarch/memchr_base.S     | 223 ++++++++++++++++++++++++++++
>   sysdeps/aarch64/multiarch/memchr_generic.S  |  33 ++++
>   7 files changed, 320 insertions(+), 3 deletions(-)
>   create mode 100644 sysdeps/aarch64/multiarch/memchr.c
>   create mode 100644 sysdeps/aarch64/multiarch/memchr_base.S
>   create mode 100644 sysdeps/aarch64/multiarch/memchr_generic.S
> 
> diff --git a/ChangeLog b/ChangeLog
> index b4c07e2..6386b1e 100644
> --- a/ChangeLog
> +++ b/ChangeLog
> @@ -1,5 +1,17 @@
>   2018-12-17  Feng Xue  <fxue@os.amperecomputing.com>
>   
> +	* sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR.
> +	[!MEMCHR](MEMCHR): Set to __memchr.
> +	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
> +	Add memchr_generic and memchr_base.
> +	* sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +	(__libc_ifunc_impl_list): Add memchr ifuncs.
> +	* sysdeps/aarch64/multiarch/memchr.c: New file.
> +	* sysdeps/aarch64/multiarch/memchr_generic.S: Likewise.
> +	* sysdeps/aarch64/multiarch/memchr_base.S: Likewise.

Please call it something else, say, memchr_nosimd.  Looks OK otherwise.

Thanks,
Siddhesh
  

Patch

diff --git a/ChangeLog b/ChangeLog
index b4c07e2..6386b1e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,17 @@ 
 2018-12-17  Feng Xue  <fxue@os.amperecomputing.com>
 
+	* sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR.
+	[!MEMCHR](MEMCHR): Set to __memchr.
+	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
+	Add memchr_generic and memchr_base.
+	* sysdeps/aarch64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add memchr ifuncs.
+	* sysdeps/aarch64/multiarch/memchr.c: New file.
+	* sysdeps/aarch64/multiarch/memchr_generic.S: Likewise.
+	* sysdeps/aarch64/multiarch/memchr_base.S: Likewise.
+
+2018-12-17  Feng Xue  <fxue@os.amperecomputing.com>
+
 	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
 	Add memset_emag.
 	* sysdeps/aarch64/multiarch/ifunc-impl-list.c
diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
index e422aef..4afebd3 100644
--- a/sysdeps/aarch64/memchr.S
+++ b/sysdeps/aarch64/memchr.S
@@ -26,6 +26,10 @@ 
  * Neon Available.
  */
 
+#ifndef MEMCHR
+# define MEMCHR __memchr
+#endif
+
 /* Arguments and results.  */
 #define srcin		x0
 #define chrin		w1
@@ -59,7 +63,7 @@ 
  * identify exactly which byte has matched.
  */
 
-ENTRY (__memchr)
+ENTRY (MEMCHR)
 	/* Do not dereference srcin if no bytes to compare.  */
 	cbz	cntin, L(zero_length)
 	/*
@@ -152,6 +156,6 @@  L(tail):
 L(zero_length):
 	mov	result, #0
 	ret
-END (__memchr)
-weak_alias (__memchr, memchr)
+END (MEMCHR)
+weak_alias (MEMCHR, memchr)
 libc_hidden_builtin_def (memchr)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 3c6c879..83b74c8 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -2,5 +2,6 @@  ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
 		   memcpy_falkor memmove_falkor \
 		   memset_generic memset_falkor memset_emag \
+		   memchr_generic memchr_base \
 		   strlen_generic strlen_asimd
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 6d4dbbe..603966a 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -53,6 +53,9 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
 	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_emag)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
+  IFUNC_IMPL (i, name, memchr,
+	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_base)
+	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_generic))
 
   IFUNC_IMPL (i, name, strlen,
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)
diff --git a/sysdeps/aarch64/multiarch/memchr.c b/sysdeps/aarch64/multiarch/memchr.c
new file mode 100644
index 0000000..ec79f85
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memchr.c
@@ -0,0 +1,41 @@ 
+/* Multiple versions of memchr. AARCH64 version.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+
+#if IS_IN (libc)
+/* Redefine memchr so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memchr
+# define memchr __redirect_memchr
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memchr) __memchr;
+
+extern __typeof (__redirect_memchr) __memchr_generic attribute_hidden;
+extern __typeof (__redirect_memchr) __memchr_base attribute_hidden;
+
+libc_ifunc (__memchr,
+	    ((IS_EMAG (midr)
+	       ? __memchr_base
+	       : __memchr_generic)));
+
+# undef memchr
+strong_alias (__memchr, memchr);
+#endif
diff --git a/sysdeps/aarch64/multiarch/memchr_base.S b/sysdeps/aarch64/multiarch/memchr_base.S
new file mode 100644
index 0000000..03901de
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memchr_base.S
@@ -0,0 +1,223 @@ 
+/* memchr - find a character in a memory zone using base integer registers 
+
+   Copyright (C) 2018 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Use base integer registers.
+ */
+
+#ifndef MEMCHR
+# define MEMCHR __memchr_base
+#endif
+
+/* Arguments and results.  */
+#define srcin		x0
+#define chrin		x1
+#define cntin		x2
+
+#define result		x0
+
+#define repchr		x1
+
+#define tmp1		x2
+#define tmp2		x3
+#define tmp3		x4
+#define tmp4		x5
+
+#define src		x6
+#define srcend		x7
+#define srcend16	x8
+
+#define anymore		x9
+
+#define zeroones	x10
+
+#define data1		x11
+#define data2		x12
+
+#define has_chr1	x13
+#define has_chr2	x14
+
+#define REP8_01		0x0101010101010101
+#define REP8_7f		0x7f7f7f7f7f7f7f7f
+
+
+ENTRY_ALIGN (MEMCHR, 6)
+
+	DELOUSE (0)
+	DELOUSE (2)
+
+	/* Do not dereference srcin if no bytes to compare. */
+	cbz	cntin, L(none_chr)
+
+	/* Start address is 16-byte aligned or not? */
+	tst	srcin, 15
+	bic	src, srcin, 15
+
+	mov	zeroones, REP8_01
+	and	repchr, chrin, 255
+	/* Generate a qword integer as |c|c|c|c|c|c|c|c|. */
+	mul	repchr, repchr, zeroones
+
+	add	srcend, srcin, cntin
+	/*
+	 * srcend16 is address of the block following the last block.
+	 *
+	 * [A block is 16-byte aligned and sized.]
+	 */
+	add	srcend16, srcend, 15
+	bic	srcend16, srcend16, 15
+
+	b.eq	L(loop)
+
+	/* Load the first block containing start address. */
+	ldp	data1, data2, [src], 16
+
+	lsl	tmp1, srcin, 3
+	mov	tmp2, ~0
+#ifdef __AARCH64EB__
+	lsr	tmp3, tmp2, tmp1
+#else
+	lsl	tmp3, tmp2, tmp1
+#endif
+	/* Start address is in the first or the second qword? */
+	tst	srcin, 8
+
+	/*
+	 * Transform any byte in the block to zero using XOR operation,
+	 * if that byte equals the char to search. In this way, searching
+	 * the char becomes detecting zero in the resulting two qwords.
+	 */
+	eor	data1, data1, repchr
+	eor	data2, data2, repchr
+
+	/*
+	 * Set those unused bytes (before the start address) to 0xff, so
+	 * that they will not hit any zero detection.
+	 */
+	orn	tmp1, data1, tmp3
+	orn	tmp2, data2, tmp3
+
+	csinv	data1, tmp1, xzr, eq
+	csel	data2, data2, tmp2, eq
+
+	/*
+	 * When the first and last block are the same, there are two cases:
+	 *  o. Memory range to search is just in one block.
+	 *      ( start address - end address) < 0
+	 *
+	 *  o. Memory range is so large that end address wrap-around.
+	 *      ( start address - end address) > 0
+	 */
+	cmp	srcin, srcend
+	ccmp	src, srcend16, 0, mi
+	csetm	anymore, ne
+	b	L(find_chr)
+
+	.p2align 4
+L(loop):
+	ldp	data1, data2, [src], 16
+
+	subs	anymore, src, srcend16
+
+	/*
+	 * Transform any byte in the block to zero using XOR operation,
+	 * if that byte equals the char to search.
+	 */
+	eor	data1, data1, repchr
+	eor	data2, data2, repchr
+
+L(find_chr):
+	/*
+	 * Use the following integer test to find out if any byte in a
+	 * qword is zero. If the qword does not contain a zero-valued
+	 * byte, the test result is zero.
+	 *
+	 *  (qword - 0x0101010101010101) & ~(qword) & 0x8080808080808080
+	 * =
+	 *  (qword - 0x0101010101010101) & ~(qword  | 0x7f7f7f7f7f7f7f7f)
+	 *
+	 */
+	sub	tmp1, data1, zeroones
+	sub	tmp2, data2, zeroones
+
+	orr	tmp3, data1, REP8_7f
+	orr	tmp4, data2, REP8_7f
+
+	bic	has_chr1, tmp1, tmp3
+	bic	has_chr2, tmp2, tmp4
+
+	orr	tmp1, has_chr1, has_chr2
+	ccmp	tmp1, 0, 0, ne
+
+	b.eq	L(loop)
+
+	cbz	has_chr1, 1f
+	sub	result, src, 16
+#ifdef __AARCH64EB__
+	rev	data1, data1
+#else
+	rev	has_chr1, has_chr1
+#endif
+	b	L(done)
+
+1:	cbz	has_chr2, L(none_chr)
+	sub	result, src, 8
+#ifdef __AARCH64EB__
+	rev	data1, data2
+#else
+	rev	has_chr1, has_chr2
+#endif
+
+L(done):
+#ifdef __AARCH64EB__
+	/*
+	 * For big-endian, we cannot directly use has_chr1/has_chr2, because
+	 * the two qwords have been byte-reversed after loading from memory.
+	 * Thus, we have to perform char detection on the two qwords again,
+	 * which should be byte-swapped this time.
+	 */
+	sub	tmp1, data1, zeroones
+	orr	tmp3, data1, REP8_7f
+	bic	has_chr1, tmp1, tmp3
+	rev	has_chr1, has_chr1
+#endif
+
+	/*
+	 * If the specified char is found in a qword, the corresponding
+	 * byte in has_chr is nonzero, though this is only guaranteed for
+	 * the first occurrence, not for other occurrences.
+	 */
+	cmp	anymore, 0
+	clz	tmp1, has_chr1
+	add	result, result, tmp1, lsr 3
+	ccmp	result, srcend, 8, eq	/* NZCV = 8000 */
+	csel	result, result, xzr, mi
+	ret
+
+L(none_chr):
+	mov	result, 0
+	ret
+
+END (MEMCHR)
+libc_hidden_builtin_def (MEMCHR)
diff --git a/sysdeps/aarch64/multiarch/memchr_generic.S b/sysdeps/aarch64/multiarch/memchr_generic.S
new file mode 100644
index 0000000..707148b
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memchr_generic.S
@@ -0,0 +1,33 @@ 
+/* Memchr for aarch64, default version for internal use.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# define MEMCHR __memchr_generic
+
+/* Do not hide the generic version of memchr, we use it internally.  */
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)
+
+/* Add a hidden definition for use within libc.so.  */
+# ifdef SHARED
+	.globl __GI_memchr; __GI_memchr = __memchr_generic
+# endif
+#endif
+
+# include "../memchr.S"