From patchwork Thu Oct 17 14:47:50 2019
X-Patchwork-Submitter: Xuelei Zhang
X-Patchwork-Id: 35093
Delivered-To: mailing list libc-alpha@sourceware.org
From: Xuelei Zhang
Subject: [PATCH] aarch64: Optimized memcmp for Kunpeng processor.
Date: Thu, 17 Oct 2019 22:47:50 +0800
Message-ID: <20191017144750.21760-1-zhangxuelei4@huawei.com>

The loop body is expanded from a 16-byte comparison to a 64-byte
comparison, and the ldp instructions are switched from post-index
addressing to base-plus-offset addressing. As a result, memcmp is
around 18% faster for sizes larger than 128 bytes. Here is the result.

Function: memcmp
Variant: default
                                  simple_memcmp    __memcmp_kunpeng    __memcmp_generic
========================================================================================================================
length=1, align1=0, align2=0:    14.06 ( -9.76%)    12.81 ( 0.00%)    12.81
length=1, align1=0, align2=0:    13.59 (-10.13%)    12.66 ( -2.53%)    12.34
length=1, align1=0, align2=0:    13.12 ( -7.69%)    12.50 ( -2.56%)    12.19
length=2, align1=0, align2=0:    14.84 ( -6.74%)    14.06 ( -1.12%)    13.91
length=2, align1=0, align2=0:    14.69 ( -6.82%)    12.97 ( 5.68%)    13.75
length=2, align1=0, align2=0:    14.38 ( -5.75%)    13.28 ( 2.30%)    13.59
length=3, align1=0, align2=0:    15.47 ( 0.00%)    14.22 ( 8.08%)    15.47
length=3, align1=0, align2=0:    15.62 ( -8.70%)    13.91 ( 3.26%)    14.38
length=3, align1=0, align2=0:    15.47 ( -7.61%)    13.91 ( 3.26%)    14.38
length=4, align1=0, align2=0:    16.72 (-24.42%)    13.12 ( 2.33%)    13.44
length=4, align1=0, align2=0:    17.19 (-35.80%)    13.12 ( -3.70%)    12.66
length=4, align1=0, align2=0:    16.56 (-29.27%)    12.19 ( 4.88%)    12.81
length=5, align1=0, align2=0:    16.88 (-28.57%)    12.81 ( 2.38%)    13.12
length=5, align1=0, align2=0:    17.97 (-49.35%)    12.34 ( -2.60%)    12.03
length=5, align1=0, align2=0:    17.19 (-39.24%)    11.88 ( 3.80%)    12.34
length=6, align1=0, align2=0:    18.75 (-29.03%)    12.34 ( 15.05%)    14.53
length=6, align1=0, align2=0:    18.44 (-35.63%)    12.34 ( 9.20%)    13.59
length=6, align1=0, align2=0:    17.81 (-28.09%)    12.50 ( 10.11%)    13.91
length=7, align1=0, align2=0:    20.62 (-37.50%)    12.66 ( 15.63%)    15.00
length=7, align1=0, align2=0:    18.75 (-27.66%)    12.34 ( 15.96%)    14.69
length=7, align1=0, align2=0:    18.28 (-27.17%)    12.50 ( 13.04%)    14.38
length=8, align1=0, align2=0:    19.84 (-64.94%)    12.81 ( -6.49%)    12.03
length=8, align1=0, align2=0:    20.00 (-66.23%)    11.56 ( 3.90%)    12.03
length=8, align1=0, align2=0:    18.91 (-59.21%)    11.41 ( 3.95%)    11.88
length=9, align1=0, align2=0:    20.31 (-66.67%)    12.66 ( -3.85%)    12.19
length=9, align1=0, align2=0:    19.84 (-71.62%)    11.88 ( -2.70%)    11.56
length=9, align1=0, align2=0:    20.00 (-62.02%)    11.88 ( 3.80%)    12.34
length=10, align1=0, align2=0:    21.72 (-82.90%)    12.34 ( -3.95%)    11.88
length=10, align1=0, align2=0:    21.56 (-81.58%)    12.03 ( -1.32%)    11.88
length=10, align1=0, align2=0:    20.94 (-71.79%)    11.72 ( 3.85%)    12.19
length=11, align1=0, align2=0:    21.41 (-75.64%)    12.03 ( 1.28%)    12.19
length=11, align1=0, align2=0:    22.81 (-87.18%)    12.19 ( 0.00%)    12.19
length=11, align1=0, align2=0:    21.41 (-77.92%)    12.03 ( 0.00%)    12.03
length=12, align1=0, align2=0:    22.50 (-89.47%)    11.88 ( 0.00%)    11.88
length=12, align1=0, align2=0:    22.97 (-93.42%)    12.66 ( -6.58%)    11.88
length=12, align1=0, align2=0:    21.72 (-80.52%)    11.72 ( 2.60%)    12.03
length=13, align1=0, align2=0:    23.28 (-101.35%)    12.50 ( -8.11%)    11.56
length=13, align1=0, align2=0:    23.28 (-93.51%)    12.34 ( -2.60%)    12.03
length=13, align1=0, align2=0:    23.12 (-92.21%)    12.19 ( -1.30%)    12.03
length=14, align1=0, align2=0:    26.56 (-117.95%)    12.03 ( 1.28%)    12.19
length=14, align1=0, align2=0:    24.06 (-94.94%)    12.19 ( 1.27%)    12.34
length=14, align1=0, align2=0:    23.59 (-98.68%)    11.88 ( 0.00%)    11.88
length=15, align1=0, align2=0:    24.69 (-100.00%)    12.19 ( 1.27%)    12.34
length=15, align1=0, align2=0:    24.53 (-101.28%)    11.88 ( 2.56%)    12.19
length=15, align1=0, align2=0:    24.22 (-101.30%)    11.41 ( 5.19%)    12.03
length=4, align1=0, align2=0:    16.09 (-27.16%)    13.12 ( -3.70%)    12.66
length=4, align1=0, align2=0:    16.09 (-27.16%)    12.66 ( 0.00%)    12.66
length=4, align1=0, align2=0:    15.62 (-29.87%)    12.50 ( -3.90%)    12.03
length=32, align1=0, align2=0:    37.81 (-181.40%)    13.12 ( 2.33%)    13.44
length=32, align1=7, align2=2:    37.66 (-197.53%)    12.03 ( 4.94%)    12.66
length=32, align1=0, align2=0:    37.97 (-189.29%)    12.34 ( 5.95%)    13.12
length=32, align1=0, align2=0:    37.19 (-190.24%)    11.72 ( 8.54%)    12.81
length=8, align1=0, align2=0:    20.00 (-62.02%)    12.34 ( 0.00%)    12.34
length=8, align1=0, align2=0:    19.38 (-51.22%)    11.25 ( 12.20%)    12.81
length=8, align1=0, align2=0:    19.22 (-64.00%)    11.72 ( 0.00%)    11.72
length=64, align1=0, align2=0:    62.97 (-97.55%)    15.62 ( 50.98%)    31.88
length=64, align1=6, align2=4:    62.34 (-74.24%)    15.94 ( 55.46%)    35.78
length=64, align1=0, align2=0:    70.16 (-138.83%)    14.06 ( 52.13%)    29.38
length=64, align1=0, align2=0:    69.53 (-130.57%)    14.53 ( 51.81%)    30.16
length=16, align1=0, align2=0:    25.31 (-105.06%)    11.72 ( 5.06%)    12.34
length=16, align1=0, align2=0:    25.78 (-114.29%)    12.97 ( -7.79%)    12.03
length=16, align1=0, align2=0:    25.16 (-111.84%)    11.88 ( 0.00%)    11.88
length=128, align1=0, align2=0:    119.22 (-515.32%)    20.00 ( -3.23%)    19.38
length=128, align1=5, align2=6:    121.25 (-496.92%)    19.38 ( 4.62%)    20.31
length=128, align1=0, align2=0:    119.38 (-542.02%)    18.75 ( -0.84%)    18.59
length=128, align1=0, align2=0:    119.53 (-542.85%)    18.28 ( 1.68%)    18.59
length=32, align1=0, align2=0:    37.66 (-186.90%)    12.03 ( 8.33%)    13.12
length=32, align1=0, align2=0:    37.50 (-192.68%)    11.72 ( 8.54%)    12.81
length=32, align1=0, align2=0:    37.19 (-190.24%)    11.72 ( 8.54%)    12.81
length=256, align1=0, align2=0:    218.44 (-685.40%)    26.88 ( 3.37%)    27.81
length=256, align1=4, align2=8:    218.12 (-512.28%)    29.69 ( 16.67%)    35.62
length=256, align1=0, align2=0:    219.84 (-713.30%)    24.53 ( 9.25%)    27.03
length=256, align1=0, align2=0:    217.97 (-711.05%)    23.91 ( 11.05%)    26.88
length=64, align1=0, align2=0:    62.81 (-101.00%)    14.38 ( 54.00%)    31.25
length=64, align1=0, align2=0:    71.09 (-144.62%)    14.06 ( 51.61%)    29.06
length=64, align1=0, align2=0:    70.47 (-145.11%)    13.75 ( 52.17%)    28.75
length=512, align1=0, align2=0:    416.25 (-861.73%)    38.59 ( 10.83%)    43.28
length=512, align1=3, align2=10:    416.25 (-627.87%)    44.84 ( 21.58%)    57.19
length=512, align1=0, align2=0:    414.69 (-844.49%)    37.66 ( 14.23%)    43.91
length=512, align1=0, align2=0:    414.84 (-883.33%)    35.94 ( 14.81%)    42.19
length=128, align1=0, align2=0:    119.22 (-478.03%)    19.22 ( 6.82%)    20.62
length=128, align1=0, align2=0:    119.38 (-521.14%)    18.28 ( 4.88%)    19.22
length=128, align1=0, align2=0:    119.38 (-536.67%)    17.97 ( 4.17%)    18.75
length=1024, align1=0, align2=0:    809.53 (-995.35%)    61.09 ( 17.34%)    73.91
length=1024, align1=2, align2=12:    810.16 (-839.31%)    73.44 ( 14.86%)    86.25
length=1024, align1=0, align2=0:    808.59 (-998.73%)    60.78 ( 17.41%)    73.59
length=1024, align1=0, align2=0:    808.28 (-1007.71%)    60.47 ( 17.13%)    72.97
length=256, align1=0, align2=0:    217.19 (-689.77%)    25.62 ( 6.82%)    27.50
length=256, align1=0, align2=0:    217.81 (-715.20%)    23.91 ( 10.53%)    26.72
length=256, align1=0, align2=0:    217.66 (-700.57%)    24.22 ( 10.92%)    27.19
length=2048, align1=0, align2=0:    1597.50 (-954.02%)    110.16 ( 27.32%)    151.56
length=2048, align1=1, align2=14:    1597.19 (-960.38%)    130.78 ( 13.17%)    150.62
length=2048, align1=0, align2=0:    1596.09 (-1083.66%)    110.00 ( 18.42%)    134.84
length=2048, align1=0, align2=0:    1601.25 (-1091.63%)    113.12 ( 15.81%)    134.38
length=512, align1=0, align2=0:    414.38 (-857.40%)    37.03 ( 14.44%)    43.28
length=512, align1=0, align2=0:    419.53 (-890.77%)    35.94 ( 15.13%)    42.34
length=512, align1=0, align2=0:    414.69 (-890.30%)    36.09 ( 13.81%)    41.88
length=4096, align1=0, align2=0:    3172.66 (-1067.63%)    217.66 ( 19.90%)    271.72
length=4096, align1=0, align2=16:    3172.81 (-1084.02%)    209.22 ( 21.92%)    267.97
length=4096, align1=0, align2=0:    3172.34 (-1062.16%)    209.22 ( 23.35%)    272.97
length=4096, align1=0, align2=0:    3172.34 (-1082.47%)    208.59 ( 22.25%)    268.28
length=1024, align1=0, align2=0:    810.00 (-1007.69%)    60.78 ( 16.88%)    73.12
length=1024, align1=0, align2=0:    813.75 (-1015.20%)    60.78 ( 16.70%)    72.97
length=1024, align1=0, align2=0:    808.12 (-1000.43%)    60.62 ( 17.45%)    73.44
length=16, align1=1, align2=2:    27.19 (-104.71%)    12.03 ( 9.41%)    13.28
length=16, align1=1, align2=2:    25.47 (-103.75%)    12.03 ( 3.75%)    12.50
length=16, align1=1, align2=2:    24.84 (-101.27%)    11.88 ( 3.80%)    12.34
length=32, align1=2, align2=4:    38.12 (-183.72%)    12.19 ( 9.30%)    13.44
length=32, align1=2, align2=4:    38.12 (-193.97%)    12.19 ( 6.02%)    12.97
length=32, align1=2, align2=4:    37.34 (-191.46%)    12.34 ( 3.66%)    12.81
length=64, align1=3, align2=6:    72.66 (-121.43%)    15.47 ( 52.86%)    32.81
length=64, align1=3, align2=6:    73.44 (-162.57%)    15.00 ( 46.37%)    27.97
length=64, align1=3, align2=6:    72.97 (-171.51%)    14.69 ( 45.35%)    26.88
length=128, align1=4, align2=8:    118.91 (-428.47%)    20.00 ( 11.11%)    22.50
length=128, align1=4, align2=8:    119.53 (-470.89%)    19.53 ( 6.72%)    20.94
length=128, align1=4, align2=8:    119.69 (-484.73%)    19.53 ( 4.58%)    20.47
length=256, align1=5, align2=10:    221.09 (-567.45%)    30.78 ( 7.08%)    33.12
length=256, align1=5, align2=10:    217.81 (-593.53%)    29.53 ( 5.97%)    31.41
length=256, align1=5, align2=10:    217.19 (-584.73%)    29.38 ( 7.39%)    31.72
length=512, align1=6, align2=12:    416.56 (-733.12%)    43.75 ( 12.50%)    50.00
length=512, align1=6, align2=12:    413.91 (-740.95%)    44.22 ( 10.16%)    49.22
length=512, align1=6, align2=12:    414.22 (-749.68%)    43.75 ( 10.26%)    48.75
length=1024, align1=7, align2=14:    809.53 (-836.89%)    75.94 ( 12.12%)    86.41
length=1024, align1=7, align2=14:    807.81 (-843.43%)    75.94 ( 11.31%)    85.62
length=1024, align1=7, align2=14:    807.66 (-859.00%)    75.62 ( 10.20%)    84.22
---
 sysdeps/aarch64/memcmp.S                    |  12 +-
 sysdeps/aarch64/multiarch/Makefile          |   1 +
 sysdeps/aarch64/multiarch/ifunc-impl-list.c |   3 +
 sysdeps/aarch64/multiarch/memcmp.c          |  42 +++++++
 sysdeps/aarch64/multiarch/memcmp_generic.S  |  35 ++++++
 sysdeps/aarch64/multiarch/memcmp_kunpeng.S  | 187 ++++++++++++++++++++++++++++
 6 files changed, 276 insertions(+), 4 deletions(-)
 create mode 100644 sysdeps/aarch64/multiarch/memcmp.c
 create mode 100644 sysdeps/aarch64/multiarch/memcmp_generic.S
 create mode 100644 sysdeps/aarch64/multiarch/memcmp_kunpeng.S

diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index f330154c7a..40ecbddb94 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -25,6 +25,10 @@
  * ARMv8-a, AArch64, unaligned accesses.
  */
 
+#ifndef MEMCMP
+# define MEMCMP memcmp
+#endif
+
 /* Parameters and result.  */
 #define src1        x0
 #define src2        x1
@@ -41,7 +45,7 @@
 #define tmp1        x7
 #define tmp2        x8
 
-ENTRY_ALIGN (memcmp, 6)
+ENTRY_ALIGN (MEMCMP, 6)
         DELOUSE (0)
         DELOUSE (1)
         DELOUSE (2)
@@ -148,7 +152,7 @@ L(byte_loop):
         sub     result, data1w, data2w
         ret
 
-END (memcmp)
+END (MEMCMP)
 #undef bcmp
-weak_alias (memcmp, bcmp)
-libc_hidden_builtin_def (memcmp)
+weak_alias (MEMCMP, bcmp)
+libc_hidden_builtin_def (MEMCMP)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 4150b89a90..eedb8e486d 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,6 +1,7 @@
 ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
                    memcpy_falkor memmove_falkor \
+                   memcmp_kunpeng memcmp_generic \
                    memset_generic memset_falkor memset_emag \
                    memchr_generic memchr_nosimd \
                    strlen_generic strlen_asimd
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index be13b916e5..fdbc751897 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -57,6 +57,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, memchr,
               IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_nosimd)
               IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_generic))
+  IFUNC_IMPL (i, name, memcmp,
+              IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_kunpeng)
+              IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_generic))
 
   IFUNC_IMPL (i, name, strlen,
               IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)
diff --git a/sysdeps/aarch64/multiarch/memcmp.c b/sysdeps/aarch64/multiarch/memcmp.c
new file mode 100644
index 0000000000..276bc1e30e
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcmp.c
@@ -0,0 +1,42 @@
+/* Multiple versions of memcmp. AARCH64 version.
+   Copyright (C) 2017-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+
+#if IS_IN (libc)
+/* Redefine memcmp so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memcmp
+# define memcmp __redirect_memcmp
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memcmp) __libc_memcmp;
+
+extern __typeof (__redirect_memcmp) __memcmp_generic attribute_hidden;
+extern __typeof (__redirect_memcmp) __memcmp_kunpeng attribute_hidden;
+
+libc_ifunc (__libc_memcmp,
+            (IS_KUNPENG(midr)
+             ? __memcmp_kunpeng
+             : __memcmp_generic));
+
+
+# undef memcmp
+strong_alias (__libc_memcmp, memcmp);
+#endif
diff --git a/sysdeps/aarch64/multiarch/memcmp_generic.S b/sysdeps/aarch64/multiarch/memcmp_generic.S
new file mode 100644
index 0000000000..88bac46075
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcmp_generic.S
@@ -0,0 +1,35 @@
+/* A Generic Optimized memcmp implementation for AARCH64.
+   Copyright (C) 2018-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+# define MEMCMP __memcmp_generic
+
+/* Do not hide the generic version of memcmp, we use it internally.  */
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal memcmp calls through a PLT.  */
+        .globl __GI_memcmp; __GI_memcmp = __memcmp_generic
+# endif
+#endif
+
+#include "../memcmp.S"
\ No newline at end of file
diff --git a/sysdeps/aarch64/multiarch/memcmp_kunpeng.S b/sysdeps/aarch64/multiarch/memcmp_kunpeng.S
new file mode 100644
index 0000000000..30c937a18d
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcmp_kunpeng.S
@@ -0,0 +1,187 @@
+/* Optimized memcmp for Huawei Kunpeng processor.
+
+   Copyright (C) 2013-2019 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ */
+
+/* Parameters and result.  */
+#define src1        x0
+#define src2        x1
+#define limit       x2
+#define result      w0
+
+/* Internal variables.  */
+#define data1       x3
+#define data1w      w3
+#define data1h      x4
+#define data2       x5
+#define data2w      w5
+#define data2h      x6
+#define tmp1        x7
+#define tmp2        x8
+
+#if IS_IN (libc)
+#define MEMCMP __memcmp_kunpeng
+
+ENTRY_ALIGN (MEMCMP, 6)
+        DELOUSE (0)
+        DELOUSE (1)
+        DELOUSE (2)
+
+        subs    limit, limit, 16
+        b.lo    L(less16)
+
+        ldp     data1, data1h, [src1], 16
+        ldp     data2, data2h, [src2], 16
+        ccmp    data1, data2, 0, ne
+        ccmp    data1h, data2h, 0, eq
+        b.ne    L(return64)
+
+        subs    limit, limit, 16
+        b.ls    L(last_bytes)
+        cmp     limit, 112
+        b.lo    L(loop16)
+
+        and     tmp1, src1, 15
+        add     limit, limit, tmp1
+        sub     src1, src1, tmp1
+        sub     src2, src2, tmp1
+        subs    limit, limit, 48
+
+        /* Compare 128 up bytes using aligned access.  */
+        .p2align 4
+L(loop64):
+        ldp     data1, data1h, [src1]
+        ldp     data2, data2h, [src2]
+        cmp     data1, data2
+        ccmp    data1h, data2h, 0, eq
+        b.ne    L(return64)
+
+        ldp     data1, data1h, [src1, 16]
+        ldp     data2, data2h, [src2, 16]
+        cmp     data1, data2
+        ccmp    data1h, data2h, 0, eq
+        b.ne    L(return64)
+
+        ldp     data1, data1h, [src1, 32]
+        ldp     data2, data2h, [src2, 32]
+        cmp     data1, data2
+        ccmp    data1h, data2h, 0, eq
+        b.ne    L(return64)
+
+        ldp     data1, data1h, [src1, 48]
+        ldp     data2, data2h, [src2, 48]
+        cmp     data1, data2
+        ccmp    data1h, data2h, 0, eq
+        b.ne    L(return64)
+
+        subs    limit, limit, 64
+        add     src1, src1, 64
+        add     src2, src2, 64
+        b.pl    L(loop64)
+        adds    limit, limit, 48
+        b.lo    L(last_bytes)
+
+L(loop16):
+        ldp     data1, data1h, [src1], 16
+        ldp     data2, data2h, [src2], 16
+        cmp     data1, data2
+        ccmp    data1h, data2h, 0, eq
+        b.ne    L(return64)
+
+        subs    limit, limit, 16
+        b.hi    L(loop16)
+        /* Compare last 1-16 bytes using unaligned access.  */
+L(last_bytes):
+        add     src1, src1, limit
+        add     src2, src2, limit
+        ldp     data1, data1h, [src1]
+        ldp     data2, data2h, [src2]
+
+        /* Compare data bytes and set return value to 0, -1 or 1.  */
+L(return64):
+        cmp     data1, data2
+        bne     L(return)
+L(return_pre):
+        mov     data1, data1h
+        mov     data2, data2h
+L(return):
+#ifndef __AARCH64EB__
+        rev     data1, data1
+        rev     data2, data2
+#endif
+        cmp     data1, data2
+L(ret_eq):
+        cset    result, ne
+        cneg    result, result, lo
+        ret
+
+        .p2align 4
+L(less16):
+        adds    limit, limit, 8
+        b.lo    L(less8)        //lo:<
+        ldr     data1, [src1]
+        ldr     data2, [src2]
+        /* equal 8 optimized */
+        ccmp    data1, data2, 0, ne
+        b.ne    L(return)
+
+        ldr     data1, [src1, limit]
+        ldr     data2, [src2, limit]
+        b       L(return)
+
+        .p2align 4
+L(less8):
+        adds    limit, limit, 4
+        b.lo    L(less4)
+        ldr     data1w, [src1]
+        ldr     data2w, [src2]
+        ccmp    data1, data2, 0, ne
+        b.ne    L(return)
+        ldr     data1w, [src1, limit]
+        ldr     data2w, [src2, limit]
+        b       L(return)
+
+        .p2align 4
+L(less4):
+        adds    limit, limit, 4
+        beq     L(ret_0)
+
+L(byte_loop):
+        ldrb    data1w, [src1], 1
+        ldrb    data2w, [src2], 1
+        subs    limit, limit, 1
+        ccmp    data1w, data2w, 0, ne   /* NZCV = 0b0000.  */
+        b.eq    L(byte_loop)
+        sub     result, data1w, data2w
+        ret
+L(ret_0):
+        mov     result, 0
+        ret
+
+END (MEMCMP)
+#undef bcmp
+weak_alias (MEMCMP, bcmp)
+libc_hidden_builtin_def (MEMCMP)
+#endif
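
Note for reviewers skimming the assembly: as the commit message says, L(loop64)
compares four 16-byte pairs at fixed offsets (0, 16, 32, 48) from the current
block base and advances both source pointers only once per 64-byte block, which
is what switching ldp from post-index to base-plus-offset addressing buys. A
rough C sketch of that strategy follows; it is illustrative only (the helper
name is made up and it only reports "differs", not the ordering), the patch
itself does this with ldp/ccmp and derives the signed result in L(return64).

#include <stdint.h>
#include <string.h>

/* Compare one 64-byte block.  Each 16-byte chunk is read at a fixed
   offset from the block base; the caller advances both pointers once
   per block.  Returns nonzero as soon as a differing chunk is found.  */
static int
chunk64_differs (const unsigned char *s1, const unsigned char *s2)
{
  for (int off = 0; off < 64; off += 16)
    {
      uint64_t a0, a1, b0, b1;
      memcpy (&a0, s1 + off, 8);
      memcpy (&a1, s1 + off + 8, 8);
      memcpy (&b0, s2 + off, 8);
      memcpy (&b1, s2 + off + 8, 8);
      if (a0 != b0 || a1 != b1)
        return 1;
    }
  return 0;
}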