aarch64: Optimized memcmp for Kunpeng processor.

Message ID 20191017144750.21760-1-zhangxuelei4@huawei.com
State Superseded
Headers

Commit Message

Xuelei Zhang Oct. 17, 2019, 2:47 p.m. UTC
  The loop body is expanded from a 16-byte comparison to a 64-byte
comparison, and the ldp instructions in the loop are changed from
post-index addressing to base-plus-offset addressing. As a result,
comparisons of more than 128 bytes are about 18% faster overall.

Here is the result.

Function: memcmp
Variant: default
                                    simple_memcmp	__memcmp_kunpeng	__memcmp_generic
  

Patch

========================================================================================================================
        length=1, align1=0, align2=0:        14.06 ( -9.76%)	       12.81 (  0.00%)	       12.81
        length=1, align1=0, align2=0:        13.59 (-10.13%)	       12.66 ( -2.53%)	       12.34
        length=1, align1=0, align2=0:        13.12 ( -7.69%)	       12.50 ( -2.56%)	       12.19
        length=2, align1=0, align2=0:        14.84 ( -6.74%)	       14.06 ( -1.12%)	       13.91
        length=2, align1=0, align2=0:        14.69 ( -6.82%)	       12.97 (  5.68%)	       13.75
        length=2, align1=0, align2=0:        14.38 ( -5.75%)	       13.28 (  2.30%)	       13.59
        length=3, align1=0, align2=0:        15.47 (  0.00%)	       14.22 (  8.08%)	       15.47
        length=3, align1=0, align2=0:        15.62 ( -8.70%)	       13.91 (  3.26%)	       14.38
        length=3, align1=0, align2=0:        15.47 ( -7.61%)	       13.91 (  3.26%)	       14.38
        length=4, align1=0, align2=0:        16.72 (-24.42%)	       13.12 (  2.33%)	       13.44
        length=4, align1=0, align2=0:        17.19 (-35.80%)	       13.12 ( -3.70%)	       12.66
        length=4, align1=0, align2=0:        16.56 (-29.27%)	       12.19 (  4.88%)	       12.81
        length=5, align1=0, align2=0:        16.88 (-28.57%)	       12.81 (  2.38%)	       13.12
        length=5, align1=0, align2=0:        17.97 (-49.35%)	       12.34 ( -2.60%)	       12.03
        length=5, align1=0, align2=0:        17.19 (-39.24%)	       11.88 (  3.80%)	       12.34
        length=6, align1=0, align2=0:        18.75 (-29.03%)	       12.34 ( 15.05%)	       14.53
        length=6, align1=0, align2=0:        18.44 (-35.63%)	       12.34 (  9.20%)	       13.59
        length=6, align1=0, align2=0:        17.81 (-28.09%)	       12.50 ( 10.11%)	       13.91
        length=7, align1=0, align2=0:        20.62 (-37.50%)	       12.66 ( 15.63%)	       15.00
        length=7, align1=0, align2=0:        18.75 (-27.66%)	       12.34 ( 15.96%)	       14.69
        length=7, align1=0, align2=0:        18.28 (-27.17%)	       12.50 ( 13.04%)	       14.38
        length=8, align1=0, align2=0:        19.84 (-64.94%)	       12.81 ( -6.49%)	       12.03
        length=8, align1=0, align2=0:        20.00 (-66.23%)	       11.56 (  3.90%)	       12.03
        length=8, align1=0, align2=0:        18.91 (-59.21%)	       11.41 (  3.95%)	       11.88
        length=9, align1=0, align2=0:        20.31 (-66.67%)	       12.66 ( -3.85%)	       12.19
        length=9, align1=0, align2=0:        19.84 (-71.62%)	       11.88 ( -2.70%)	       11.56
        length=9, align1=0, align2=0:        20.00 (-62.02%)	       11.88 (  3.80%)	       12.34
       length=10, align1=0, align2=0:        21.72 (-82.90%)	       12.34 ( -3.95%)	       11.88
       length=10, align1=0, align2=0:        21.56 (-81.58%)	       12.03 ( -1.32%)	       11.88
       length=10, align1=0, align2=0:        20.94 (-71.79%)	       11.72 (  3.85%)	       12.19
       length=11, align1=0, align2=0:        21.41 (-75.64%)	       12.03 (  1.28%)	       12.19
       length=11, align1=0, align2=0:        22.81 (-87.18%)	       12.19 (  0.00%)	       12.19
       length=11, align1=0, align2=0:        21.41 (-77.92%)	       12.03 (  0.00%)	       12.03
       length=12, align1=0, align2=0:        22.50 (-89.47%)	       11.88 (  0.00%)	       11.88
       length=12, align1=0, align2=0:        22.97 (-93.42%)	       12.66 ( -6.58%)	       11.88
       length=12, align1=0, align2=0:        21.72 (-80.52%)	       11.72 (  2.60%)	       12.03
       length=13, align1=0, align2=0:        23.28 (-101.35%)	       12.50 ( -8.11%)	       11.56
       length=13, align1=0, align2=0:        23.28 (-93.51%)	       12.34 ( -2.60%)	       12.03
       length=13, align1=0, align2=0:        23.12 (-92.21%)	       12.19 ( -1.30%)	       12.03
       length=14, align1=0, align2=0:        26.56 (-117.95%)	       12.03 (  1.28%)	       12.19
       length=14, align1=0, align2=0:        24.06 (-94.94%)	       12.19 (  1.27%)	       12.34
       length=14, align1=0, align2=0:        23.59 (-98.68%)	       11.88 (  0.00%)	       11.88
       length=15, align1=0, align2=0:        24.69 (-100.00%)	       12.19 (  1.27%)	       12.34
       length=15, align1=0, align2=0:        24.53 (-101.28%)	       11.88 (  2.56%)	       12.19
       length=15, align1=0, align2=0:        24.22 (-101.30%)	       11.41 (  5.19%)	       12.03
        length=4, align1=0, align2=0:        16.09 (-27.16%)	       13.12 ( -3.70%)	       12.66
        length=4, align1=0, align2=0:        16.09 (-27.16%)	       12.66 (  0.00%)	       12.66
        length=4, align1=0, align2=0:        15.62 (-29.87%)	       12.50 ( -3.90%)	       12.03
       length=32, align1=0, align2=0:        37.81 (-181.40%)	       13.12 (  2.33%)	       13.44
       length=32, align1=7, align2=2:        37.66 (-197.53%)	       12.03 (  4.94%)	       12.66
       length=32, align1=0, align2=0:        37.97 (-189.29%)	       12.34 (  5.95%)	       13.12
       length=32, align1=0, align2=0:        37.19 (-190.24%)	       11.72 (  8.54%)	       12.81
        length=8, align1=0, align2=0:        20.00 (-62.02%)	       12.34 (  0.00%)	       12.34
        length=8, align1=0, align2=0:        19.38 (-51.22%)	       11.25 ( 12.20%)	       12.81
        length=8, align1=0, align2=0:        19.22 (-64.00%)	       11.72 (  0.00%)	       11.72
       length=64, align1=0, align2=0:        62.97 (-97.55%)	       15.62 ( 50.98%)	       31.88
       length=64, align1=6, align2=4:        62.34 (-74.24%)	       15.94 ( 55.46%)	       35.78
       length=64, align1=0, align2=0:        70.16 (-138.83%)	       14.06 ( 52.13%)	       29.38
       length=64, align1=0, align2=0:        69.53 (-130.57%)	       14.53 ( 51.81%)	       30.16
       length=16, align1=0, align2=0:        25.31 (-105.06%)	       11.72 (  5.06%)	       12.34
       length=16, align1=0, align2=0:        25.78 (-114.29%)	       12.97 ( -7.79%)	       12.03
       length=16, align1=0, align2=0:        25.16 (-111.84%)	       11.88 (  0.00%)	       11.88
      length=128, align1=0, align2=0:       119.22 (-515.32%)	       20.00 ( -3.23%)	       19.38
      length=128, align1=5, align2=6:       121.25 (-496.92%)	       19.38 (  4.62%)	       20.31
      length=128, align1=0, align2=0:       119.38 (-542.02%)	       18.75 ( -0.84%)	       18.59
      length=128, align1=0, align2=0:       119.53 (-542.85%)	       18.28 (  1.68%)	       18.59
       length=32, align1=0, align2=0:        37.66 (-186.90%)	       12.03 (  8.33%)	       13.12
       length=32, align1=0, align2=0:        37.50 (-192.68%)	       11.72 (  8.54%)	       12.81
       length=32, align1=0, align2=0:        37.19 (-190.24%)	       11.72 (  8.54%)	       12.81
      length=256, align1=0, align2=0:       218.44 (-685.40%)	       26.88 (  3.37%)	       27.81
      length=256, align1=4, align2=8:       218.12 (-512.28%)	       29.69 ( 16.67%)	       35.62
      length=256, align1=0, align2=0:       219.84 (-713.30%)	       24.53 (  9.25%)	       27.03
      length=256, align1=0, align2=0:       217.97 (-711.05%)	       23.91 ( 11.05%)	       26.88
       length=64, align1=0, align2=0:        62.81 (-101.00%)	       14.38 ( 54.00%)	       31.25
       length=64, align1=0, align2=0:        71.09 (-144.62%)	       14.06 ( 51.61%)	       29.06
       length=64, align1=0, align2=0:        70.47 (-145.11%)	       13.75 ( 52.17%)	       28.75
      length=512, align1=0, align2=0:       416.25 (-861.73%)	       38.59 ( 10.83%)	       43.28
     length=512, align1=3, align2=10:       416.25 (-627.87%)	       44.84 ( 21.58%)	       57.19
      length=512, align1=0, align2=0:       414.69 (-844.49%)	       37.66 ( 14.23%)	       43.91
      length=512, align1=0, align2=0:       414.84 (-883.33%)	       35.94 ( 14.81%)	       42.19
      length=128, align1=0, align2=0:       119.22 (-478.03%)	       19.22 (  6.82%)	       20.62
      length=128, align1=0, align2=0:       119.38 (-521.14%)	       18.28 (  4.88%)	       19.22
      length=128, align1=0, align2=0:       119.38 (-536.67%)	       17.97 (  4.17%)	       18.75
     length=1024, align1=0, align2=0:       809.53 (-995.35%)	       61.09 ( 17.34%)	       73.91
    length=1024, align1=2, align2=12:       810.16 (-839.31%)	       73.44 ( 14.86%)	       86.25
     length=1024, align1=0, align2=0:       808.59 (-998.73%)	       60.78 ( 17.41%)	       73.59
     length=1024, align1=0, align2=0:       808.28 (-1007.71%)	       60.47 ( 17.13%)	       72.97
      length=256, align1=0, align2=0:       217.19 (-689.77%)	       25.62 (  6.82%)	       27.50
      length=256, align1=0, align2=0:       217.81 (-715.20%)	       23.91 ( 10.53%)	       26.72
      length=256, align1=0, align2=0:       217.66 (-700.57%)	       24.22 ( 10.92%)	       27.19
     length=2048, align1=0, align2=0:      1597.50 (-954.02%)	      110.16 ( 27.32%)	      151.56
    length=2048, align1=1, align2=14:      1597.19 (-960.38%)	      130.78 ( 13.17%)	      150.62
     length=2048, align1=0, align2=0:      1596.09 (-1083.66%)	      110.00 ( 18.42%)	      134.84
     length=2048, align1=0, align2=0:      1601.25 (-1091.63%)	      113.12 ( 15.81%)	      134.38
      length=512, align1=0, align2=0:       414.38 (-857.40%)	       37.03 ( 14.44%)	       43.28
      length=512, align1=0, align2=0:       419.53 (-890.77%)	       35.94 ( 15.13%)	       42.34
      length=512, align1=0, align2=0:       414.69 (-890.30%)	       36.09 ( 13.81%)	       41.88
     length=4096, align1=0, align2=0:      3172.66 (-1067.63%)	      217.66 ( 19.90%)	      271.72
    length=4096, align1=0, align2=16:      3172.81 (-1084.02%)	      209.22 ( 21.92%)	      267.97
     length=4096, align1=0, align2=0:      3172.34 (-1062.16%)	      209.22 ( 23.35%)	      272.97
     length=4096, align1=0, align2=0:      3172.34 (-1082.47%)	      208.59 ( 22.25%)	      268.28
     length=1024, align1=0, align2=0:       810.00 (-1007.69%)	       60.78 ( 16.88%)	       73.12
     length=1024, align1=0, align2=0:       813.75 (-1015.20%)	       60.78 ( 16.70%)	       72.97
     length=1024, align1=0, align2=0:       808.12 (-1000.43%)	       60.62 ( 17.45%)	       73.44
       length=16, align1=1, align2=2:        27.19 (-104.71%)	       12.03 (  9.41%)	       13.28
       length=16, align1=1, align2=2:        25.47 (-103.75%)	       12.03 (  3.75%)	       12.50
       length=16, align1=1, align2=2:        24.84 (-101.27%)	       11.88 (  3.80%)	       12.34
       length=32, align1=2, align2=4:        38.12 (-183.72%)	       12.19 (  9.30%)	       13.44
       length=32, align1=2, align2=4:        38.12 (-193.97%)	       12.19 (  6.02%)	       12.97
       length=32, align1=2, align2=4:        37.34 (-191.46%)	       12.34 (  3.66%)	       12.81
       length=64, align1=3, align2=6:        72.66 (-121.43%)	       15.47 ( 52.86%)	       32.81
       length=64, align1=3, align2=6:        73.44 (-162.57%)	       15.00 ( 46.37%)	       27.97
       length=64, align1=3, align2=6:        72.97 (-171.51%)	       14.69 ( 45.35%)	       26.88
      length=128, align1=4, align2=8:       118.91 (-428.47%)	       20.00 ( 11.11%)	       22.50
      length=128, align1=4, align2=8:       119.53 (-470.89%)	       19.53 (  6.72%)	       20.94
      length=128, align1=4, align2=8:       119.69 (-484.73%)	       19.53 (  4.58%)	       20.47
     length=256, align1=5, align2=10:       221.09 (-567.45%)	       30.78 (  7.08%)	       33.12
     length=256, align1=5, align2=10:       217.81 (-593.53%)	       29.53 (  5.97%)	       31.41
     length=256, align1=5, align2=10:       217.19 (-584.73%)	       29.38 (  7.39%)	       31.72
     length=512, align1=6, align2=12:       416.56 (-733.12%)	       43.75 ( 12.50%)	       50.00
     length=512, align1=6, align2=12:       413.91 (-740.95%)	       44.22 ( 10.16%)	       49.22
     length=512, align1=6, align2=12:       414.22 (-749.68%)	       43.75 ( 10.26%)	       48.75
    length=1024, align1=7, align2=14:       809.53 (-836.89%)	       75.94 ( 12.12%)	       86.41
    length=1024, align1=7, align2=14:       807.81 (-843.43%)	       75.94 ( 11.31%)	       85.62
    length=1024, align1=7, align2=14:       807.66 (-859.00%)	       75.62 ( 10.20%)	       84.22
---
 sysdeps/aarch64/memcmp.S                    |  12 +-
 sysdeps/aarch64/multiarch/Makefile          |   1 +
 sysdeps/aarch64/multiarch/ifunc-impl-list.c |   3 +
 sysdeps/aarch64/multiarch/memcmp.c          |  42 +++++++
 sysdeps/aarch64/multiarch/memcmp_generic.S  |  35 ++++++
 sysdeps/aarch64/multiarch/memcmp_kunpeng.S  | 187 ++++++++++++++++++++++++++++
 6 files changed, 276 insertions(+), 4 deletions(-)
 create mode 100644 sysdeps/aarch64/multiarch/memcmp.c
 create mode 100644 sysdeps/aarch64/multiarch/memcmp_generic.S
 create mode 100644 sysdeps/aarch64/multiarch/memcmp_kunpeng.S

diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index f330154c7a..40ecbddb94 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -25,6 +25,10 @@ 
  * ARMv8-a, AArch64, unaligned accesses.
  */
 
+#ifndef MEMCMP
+# define MEMCMP memcmp	/* Default name; multiarch wrappers predefine it.  */
+#endif
+
 /* Parameters and result.  */
 #define src1		x0
 #define src2		x1
@@ -41,7 +45,7 @@ 
 #define tmp1		x7
 #define tmp2		x8
 
-ENTRY_ALIGN (memcmp, 6)
+ENTRY_ALIGN (MEMCMP, 6)
 	DELOUSE (0)
 	DELOUSE (1)
 	DELOUSE (2)
@@ -148,7 +152,7 @@  L(byte_loop):
 	sub	result, data1w, data2w
 	ret
 
-END (memcmp)
+END (MEMCMP)
 #undef bcmp
-weak_alias (memcmp, bcmp)
-libc_hidden_builtin_def (memcmp)
+weak_alias (MEMCMP, bcmp)
+libc_hidden_builtin_def (MEMCMP)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 4150b89a90..eedb8e486d 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,6 +1,7 @@ 
 ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
 		   memcpy_falkor memmove_falkor \
+		   memcmp_kunpeng memcmp_generic \
 		   memset_generic memset_falkor memset_emag \
 		   memchr_generic memchr_nosimd \
 		   strlen_generic strlen_asimd
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index be13b916e5..fdbc751897 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -57,6 +57,9 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, memchr,
 	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_nosimd)
 	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_generic))
+  IFUNC_IMPL (i, name, memcmp,
+	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_kunpeng)
+	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_generic))
 
   IFUNC_IMPL (i, name, strlen,
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)
diff --git a/sysdeps/aarch64/multiarch/memcmp.c b/sysdeps/aarch64/multiarch/memcmp.c
new file mode 100644
index 0000000000..276bc1e30e
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcmp.c
@@ -0,0 +1,42 @@ 
+/* Multiple versions of memcmp. AARCH64 version.
+   Copyright (C) 2017-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+
+#if IS_IN (libc)
+/* Redefine memcmp so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memcmp
+# define memcmp __redirect_memcmp
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memcmp) __libc_memcmp;
+
+extern __typeof (__redirect_memcmp) __memcmp_generic attribute_hidden;
+extern __typeof (__redirect_memcmp) __memcmp_kunpeng attribute_hidden;
+
+/* NOTE(review): IS_KUNPENG is not defined in this patch; confirm it exists.  */
+libc_ifunc (__libc_memcmp,
+	    (IS_KUNPENG (midr)
+	     ? __memcmp_kunpeng
+	     : __memcmp_generic));
+
+# undef memcmp
+strong_alias (__libc_memcmp, memcmp);
+#endif
diff --git a/sysdeps/aarch64/multiarch/memcmp_generic.S b/sysdeps/aarch64/multiarch/memcmp_generic.S
new file mode 100644
index 0000000000..88bac46075
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcmp_generic.S
@@ -0,0 +1,35 @@ 
+/* A Generic Optimized memcmp implementation for AARCH64.
+   Copyright (C) 2018-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+# define MEMCMP __memcmp_generic	/* Name given to ../memcmp.S's entry.  */
+
+/* Do not hide the generic memcmp, it is used internally: no-op the macro.  */
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)
+
+# ifdef SHARED
+/* Bind libc-internal memcmp calls (__GI_memcmp) directly, bypassing a PLT.  */
+	.globl __GI_memcmp; __GI_memcmp = __memcmp_generic
+# endif /* SHARED */
+#endif /* IS_IN (libc) */
+
+#include "../memcmp.S"
\ No newline at end of file
diff --git a/sysdeps/aarch64/multiarch/memcmp_kunpeng.S b/sysdeps/aarch64/multiarch/memcmp_kunpeng.S
new file mode 100644
index 0000000000..30c937a18d
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcmp_kunpeng.S
@@ -0,0 +1,187 @@ 
+/* Optimized memcmp for Huawei Kunpeng processor.
+
+   Copyright (C) 2013-2019 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ */
+
+/* Parameters and result (x0-x2 in, w0 out).  */
+#define src1		x0	/* First buffer.  */
+#define src2		x1	/* Second buffer.  */
+#define limit		x2	/* Byte count; adjusted as bytes are consumed.  */
+#define result		w0	/* Return value; shares a register with src1.  */
+
+/* Internal variables.  */
+#define data1		x3	/* Data from src1 (first register of an ldp).  */
+#define data1w		w3	/* 32-bit view of data1.  */
+#define data1h		x4	/* Data from src1 (second register of an ldp).  */
+#define data2		x5	/* Data from src2 (first register of an ldp).  */
+#define data2w		w5	/* 32-bit view of data2.  */
+#define data2h		x6	/* Data from src2 (second register of an ldp).  */
+#define tmp1		x7	/* Misalignment of src1 before the 64-byte loop.  */
+#define tmp2		x8	/* Not referenced in this file.  */
+
+#if IS_IN (libc)
+# define MEMCMP	__memcmp_kunpeng
+
+ENTRY_ALIGN (MEMCMP, 6)
+	DELOUSE (0)
+	DELOUSE (1)
+	DELOUSE (2)
+
+	subs	limit, limit, 16	/* limit = n - 16.  */
+	b.lo	L(less16)		/* n < 16.  */
+
+	ldp	data1, data1h, [src1], 16
+	ldp	data2, data2h, [src2], 16
+	ccmp	data1, data2, 0, ne	/* n == 16 forces "ne" (NZCV = 0).  */
+	ccmp	data1h, data2h, 0, eq
+	b.ne	L(return64)		/* Mismatch, or exactly 16 bytes.  */
+
+	subs	limit, limit, 16	/* limit = n - 32.  */
+	b.ls	L(last_bytes)		/* n <= 32: tail only.  */
+	cmp	limit, 112
+	b.lo	L(loop16)		/* n < 144: 16 bytes per iteration.  */
+
+	and	tmp1, src1, 15		/* Align src1 down to 16 bytes ...  */
+	add	limit, limit, tmp1	/* ... counting the re-read bytes.  */
+	sub	src1, src1, tmp1
+	sub	src2, src2, tmp1
+	sub	limit, limit, 48	/* limit = bytes left - 64.  */
+
+	/* n >= 144: 64 bytes per iteration with src1 16-byte aligned.  */
+	.p2align 4
+L(loop64):
+	ldp	data1, data1h, [src1]
+	ldp	data2, data2h, [src2]
+	cmp	data1, data2
+	ccmp	data1h, data2h, 0, eq
+	b.ne	L(return64)
+
+	ldp	data1, data1h, [src1, 16]
+	ldp	data2, data2h, [src2, 16]
+	cmp	data1, data2
+	ccmp	data1h, data2h, 0, eq
+	b.ne	L(return64)
+
+	ldp	data1, data1h, [src1, 32]
+	ldp	data2, data2h, [src2, 32]
+	cmp	data1, data2
+	ccmp	data1h, data2h, 0, eq
+	b.ne	L(return64)
+
+	ldp	data1, data1h, [src1, 48]
+	ldp	data2, data2h, [src2, 48]
+	cmp	data1, data2
+	ccmp	data1h, data2h, 0, eq
+	b.ne	L(return64)
+
+	subs	limit, limit, 64
+	add	src1, src1, 64		/* Flags from subs are kept.  */
+	add	src2, src2, 64
+	b.pl	L(loop64)		/* At least 64 bytes still left.  */
+	adds	limit, limit, 48	/* limit = bytes left - 16.  */
+	b.lo	L(last_bytes)		/* Fewer than 16 bytes left.  */
+
+L(loop16):
+	ldp	data1, data1h, [src1], 16
+	ldp	data2, data2h, [src2], 16
+	cmp	data1, data2
+	ccmp	data1h, data2h, 0, eq
+	b.ne	L(return64)
+
+	subs	limit, limit, 16
+	b.hi	L(loop16)		/* More than 16 bytes still left.  */
+	/* Compare the final 16 bytes (limit <= 0 rewinds the pointers).  */
+L(last_bytes):
+	add	src1, src1, limit	/* Now 16 bytes before the end.  */
+	add	src2, src2, limit
+	ldp	data1, data1h, [src1]
+	ldp	data2, data2h, [src2]
+
+	/* Compare data bytes and set return value to 0, -1 or 1.  */
+L(return64):
+	cmp	data1, data2
+	b.ne	L(return)		/* The first 8 bytes differ.  */
+L(return_pre):
+	mov	data1, data1h		/* Otherwise use the second 8.  */
+	mov	data2, data2h
+L(return):
+# ifndef __AARCH64EB__
+	rev	data1, data1		/* First byte becomes the MSB.  */
+	rev	data2, data2
+# endif
+	cmp	data1, data2
+L(ret_eq):
+	cset	result, ne		/* 0 if equal, else 1 ...  */
+	cneg	result, result, lo	/* ... negated when data1 < data2.  */
+	ret
+
+	.p2align 4
+L(less16):
+	adds	limit, limit, 8		/* limit = n - 8.  */
+	b.lo	L(less8)		/* n < 8.  */
+	ldr	data1, [src1]
+	ldr	data2, [src2]
+	/* n == 8 forces "ne" (NZCV = 0) so L(return) does the compare.  */
+	ccmp	data1, data2, 0, ne
+	b.ne	L(return)
+
+	ldr	data1, [src1, limit]	/* Last 8 bytes (may overlap).  */
+	ldr	data2, [src2, limit]
+	b	L(return)
+
+	.p2align 4
+L(less8):
+	adds	limit, limit, 4		/* limit = n - 4.  */
+	b.lo	L(less4)		/* n < 4.  */
+	ldr	data1w, [src1]
+	ldr	data2w, [src2]
+	ccmp	data1, data2, 0, ne	/* n == 4 forces "ne", as above.  */
+	b.ne	L(return)
+	ldr	data1w, [src1, limit]	/* Last 4 bytes (may overlap).  */
+	ldr	data2w, [src2, limit]
+	b	L(return)
+
+	.p2align 4
+L(less4):
+	adds	limit, limit, 4		/* limit = n.  */
+	b.eq	L(ret_0)		/* Nothing to compare.  */
+
+L(byte_loop):
+	ldrb	data1w, [src1], 1
+	ldrb	data2w, [src2], 1
+	subs	limit, limit, 1
+	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
+	b.eq	L(byte_loop)		/* Bytes equal and count not exhausted.  */
+	sub	result, data1w, data2w
+	ret
+L(ret_0):
+	mov	result, 0
+	ret
+
+END (MEMCMP)
+# undef bcmp
+weak_alias (MEMCMP, bcmp)	/* NOTE(review): ../memcmp.S also does this.  */
+libc_hidden_builtin_def (MEMCMP)
+#endif