From patchwork Thu Oct 17 14:53:01 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Xuelei Zhang X-Patchwork-Id: 35096 Received: (qmail 7321 invoked by alias); 17 Oct 2019 14:53:21 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 7310 invoked by uid 89); 17 Oct 2019 14:53:20 -0000 Authentication-Results: sourceware.org; auth=none X-Spam-SWARE-Status: No, score=-18.6 required=5.0 tests=AWL, BAYES_00, GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, KAM_MANYTO, KAM_SHORT, SPF_HELO_PASS, SPF_PASS autolearn=ham version=3.3.1 spammy=shifting X-HELO: huawei.com From: Xuelei Zhang To: , , , , , Subject: [PATCH] aarch64: Optimized strnlen for Kunpeng processor Date: Thu, 17 Oct 2019 22:53:01 +0800 Message-ID: <20191017145301.6008-1-zhangxuelei4@huawei.com> MIME-Version: 1.0 Optimize the strnlen implementation by using vector operations and loop unrolling in the main loop. Compared to aarch64/strnlen.S, it reduces latency of cases in bench-strnlen by 11%~24% when the length of src is greater than 64 bytes, with gains throughout the benchmark.
Here is the result: simple_strnlen __strnlen_kunpeng __strnlen_generic Length 1, alignment 0: 16.0938 12.3438 11.0938 Length 1, alignment 0: 11.25 12.0312 12.1875 Length 1, alignment 0: 12.9688 12.1875 12.0312 Length 2, alignment 0: 10.9375 12.3438 12.3438 Length 2, alignment 0: 12.5 12.6562 12.8125 Length 2, alignment 0: 14.5312 12.3438 12.1875 Length 3, alignment 0: 12.8125 12.3438 12.1875 Length 3, alignment 0: 13.5938 12.5 12.6562 Length 3, alignment 0: 14.8438 12.3438 12.1875 Length 4, alignment 0: 13.75 12.3438 12.3438 Length 4, alignment 0: 15 12.3438 12.3438 Length 4, alignment 0: 15.7812 12.3438 12.1875 Length 5, alignment 0: 14.2188 12.5 12.0312 Length 5, alignment 0: 15 12.3438 12.3438 Length 5, alignment 0: 16.7188 12.3438 12.5 Length 6, alignment 0: 14.6875 12.1875 12.0312 Length 6, alignment 0: 16.4062 12.3438 12.5 Length 6, alignment 0: 17.3438 12.3438 12.3438 Length 7, alignment 0: 15.4688 12.3438 12.3438 Length 7, alignment 0: 16.5625 12.3438 12.3438 Length 7, alignment 0: 17.5 12.3438 12.3438 Length 1, alignment 1: 10.7812 11.5625 11.0938 Length 1, alignment 1: 10.9375 13.4375 13.5938 Length 1, alignment 1: 12.5 13.4375 13.75 Length 2, alignment 2: 10.9375 13.5938 13.4375 Length 2, alignment 2: 12.6562 13.5938 13.2812 Length 2, alignment 2: 14.375 13.5938 13.9062 Length 3, alignment 3: 12.3438 13.5938 13.4375 Length 3, alignment 3: 24.0625 13.5938 13.5938 Length 3, alignment 3: 15.3125 14.0625 13.2812 Length 4, alignment 4: 23.2812 12.9688 12.6562 Length 4, alignment 4: 15.3125 13.4375 12.9688 Length 4, alignment 4: 16.25 13.125 13.125 Length 5, alignment 5: 14.6875 13.2812 13.125 Length 5, alignment 5: 15.625 13.2812 12.9688 Length 5, alignment 5: 17.3438 13.2812 13.125 Length 6, alignment 6: 14.8438 13.2812 12.8125 Length 6, alignment 6: 16.5625 13.125 12.6562 Length 6, alignment 6: 17.3438 13.125 12.9688 Length 7, alignment 7: 15.4688 13.2812 13.125 Length 7, alignment 7: 17.1875 13.125 13.125 Length 7, alignment 7: 18.5938 13.2812 12.9688 
Length 4, alignment 0: 16.0938 12.3438 12.0312 Length 4, alignment 1: 15.3125 13.2812 13.125 Length 8, alignment 0: 19.0625 12.5 12.1875 Length 8, alignment 1: 18.125 13.4375 12.9688 Length 16, alignment 0: 25.3125 13.5938 14.2188 Length 16, alignment 1: 24.5312 14.5312 15.1562 Length 32, alignment 0: 37.3438 14.0625 16.875 Length 32, alignment 1: 36.5625 15.3125 17.5 Length 64, alignment 0: 67.5 17.1875 20.7812 Length 64, alignment 1: 67.6562 17.5 19.8438 Length 128, alignment 0: 117.031 20.4688 23.9062 Length 128, alignment 1: 117.344 22.3438 27.8125 Length 256, alignment 0: 215.312 30 33.9062 Length 256, alignment 1: 215.312 31.25 36.0938 Length 512, alignment 0: 412.031 44.0625 57.8125 Length 512, alignment 1: 412.656 46.5625 58.5938 Length 1024, alignment 0: 806.25 79.8438 102.031 Length 1024, alignment 1: 806.094 79.2188 101.875 Length 1, alignment 0: 12.6562 12.3438 12.3438 Length 2, alignment 0: 14.0625 11.7188 12.3438 Length 3, alignment 0: 14.6875 12.0312 12.0312 Length 4, alignment 0: 15.625 12.1875 11.875 Length 5, alignment 0: 16.25 12.3438 11.875 Length 6, alignment 0: 17.0312 12.3438 12.8125 Length 7, alignment 0: 17.5 12.0312 12.3438 Length 1, alignment 1: 12.5 13.5938 13.9062 Length 2, alignment 2: 13.75 13.4375 13.5938 Length 3, alignment 3: 14.375 13.75 13.4375 Length 4, alignment 4: 15.3125 13.2812 12.8125 Length 5, alignment 5: 16.25 13.125 12.8125 Length 6, alignment 6: 16.7188 13.5938 13.4375 Length 7, alignment 7: 17.6562 13.2812 12.9688 Length 4, alignment 0: 15.3125 12.6562 12.5 Length 4, alignment 1: 15.1562 13.2812 13.2812 Length 8, alignment 0: 18.4375 12.3438 12.6562 Length 8, alignment 1: 18.4375 13.2812 13.125 Length 16, alignment 0: 25 13.4375 14.0625 Length 16, alignment 1: 24.6875 14.0625 15 Length 32, alignment 0: 37.5 13.9062 14.5312 Length 32, alignment 1: 37.0312 14.8438 17.3438 Length 64, alignment 0: 67.8125 17.1875 18.2812 Length 64, alignment 1: 67.8125 17.3438 19.8438 Length 128, alignment 0: 117.031 21.25 23.9062 Length 
128, alignment 1: 116.562 21.25 25 Length 256, alignment 0: 215.156 30.3125 34.0625 Length 256, alignment 1: 215.312 31.875 35.1562 Length 512, alignment 0: 411.719 44.2188 59.0625 Length 512, alignment 1: 412.031 46.0938 57.8125 Length 1024, alignment 0: 805.938 77.5 102.344 Length 1024, alignment 1: 805.625 79.5312 102.5 --- sysdeps/aarch64/multiarch/Makefile | 1 + sysdeps/aarch64/multiarch/ifunc-impl-list.c | 4 + sysdeps/aarch64/multiarch/strnlen.c | 37 +++++ sysdeps/aarch64/multiarch/strnlen_generic.S | 40 ++++++ sysdeps/aarch64/multiarch/strnlen_kunpeng.S | 215 ++++++++++++++++++++++++++++ sysdeps/aarch64/strnlen.S | 12 +- 6 files changed, 305 insertions(+), 4 deletions(-) create mode 100644 sysdeps/aarch64/multiarch/strnlen.c create mode 100644 sysdeps/aarch64/multiarch/strnlen_generic.S create mode 100644 sysdeps/aarch64/multiarch/strnlen_kunpeng.S diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile index 4150b89a90..a9d163d20f 100644 --- a/sysdeps/aarch64/multiarch/Makefile +++ b/sysdeps/aarch64/multiarch/Makefile @@ -3,5 +3,6 @@ sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \ memcpy_falkor memmove_falkor \ memset_generic memset_falkor memset_emag \ memchr_generic memchr_nosimd \ + strnlen_generic strnlen_kunpeng \ strlen_generic strlen_asimd endif diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c index be13b916e5..1e253799a5 100644 --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c @@ -62,5 +62,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd) IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_generic)) + IFUNC_IMPL (i, name, strnlen, + IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_kunpeng) + IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_generic)) + return i; } diff --git a/sysdeps/aarch64/multiarch/strnlen.c 
b/sysdeps/aarch64/multiarch/strnlen.c new file mode 100644 index 0000000000..3c832de847 --- /dev/null +++ b/sysdeps/aarch64/multiarch/strnlen.c @@ -0,0 +1,37 @@ +/* Multiple versions of strnlen. AARCH64 version. + Copyright (C) 2019-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +/* Redefine strnlen so that the compiler won't complain about the type + mismatch with the IFUNC selector in weak_alias, below. */ +# define strnlen __redirect_strnlen +# define __strnlen __redirect___strnlen +# include <string.h> +# include <init-arch.h> + +extern __typeof (__strnlen) __strnlen_generic attribute_hidden; +extern __typeof (__strnlen) __strnlen_kunpeng attribute_hidden; +# undef strnlen +# undef __strnlen + +libc_ifunc_redirected (__redirect___strnlen, __strnlen, + (IS_KUNPENG (midr) ? __strnlen_kunpeng : __strnlen_generic)); + +weak_alias (__strnlen, strnlen); +#endif diff --git a/sysdeps/aarch64/multiarch/strnlen_generic.S b/sysdeps/aarch64/multiarch/strnlen_generic.S new file mode 100644 index 0000000000..4b562bc3dd --- /dev/null +++ b/sysdeps/aarch64/multiarch/strnlen_generic.S @@ -0,0 +1,40 @@ +/* A Generic Optimized strnlen implementation for AARCH64. + Copyright (C) 2018-2019 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* The actual strnlen code is in ../strnlen.S. If we are building libc this file + defines __strnlen_generic. Otherwise the include of ../strnlen.S will define + the normal __strnlen entry points. */ + +#include <sysdep.h> + +#if IS_IN (libc) + +# define STRNLEN __strnlen_generic + +/* Do not hide the generic version of strnlen, we use it internally. */ +# undef libc_hidden_def +# define libc_hidden_def(name) + +# ifdef SHARED + .globl __GI_strnlen; __GI_strnlen = STRNLEN + .globl __GI___strnlen; __GI___strnlen = STRNLEN +# endif +#endif + +#include "../strnlen.S" + diff --git a/sysdeps/aarch64/multiarch/strnlen_kunpeng.S b/sysdeps/aarch64/multiarch/strnlen_kunpeng.S new file mode 100644 index 0000000000..a2be5fd1ec --- /dev/null +++ b/sysdeps/aarch64/multiarch/strnlen_kunpeng.S @@ -0,0 +1,215 @@ +/* Optimized strnlen for Huawei Kunpeng processor. + + Copyright (C) 2013-2019 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD + */ + +/* Arguments and results. */ +#define srcin x0 +#define len x0 +#define limit x1 + +/* Locals and temporaries. */ +#define src x2 +#define data1 x3 +#define data2 x4 +#define data2a x5 +#define has_nul1 x6 +#define has_nul2 x7 +#define tmp1 x8 +#define tmp2 x9 +#define tmp3 x10 +#define tmp4 x11 +#define zeroones x12 +#define pos x13 +#define limit_wd x14 + +/* NEON register */ +#define dataq q2 +#define datav v2 +#define datab2 b3 +#define dataq2 q3 +#define datav2 v3 + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +ENTRY_ALIGN_AND_PAD (__strnlen_kunpeng, 6, 9) + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) + cbz limit, L(hit_limit) + mov zeroones, #REP8_01 + bic src, srcin, #15 + ands tmp1, srcin, #15 + b.ne L(misaligned) + /* Calculate the number of full and partial words -1. */ + sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ + lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ + + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. */ + /* The inner loop deals with two Dwords at a time. This has a + slightly higher start-up cost, but we should win quite quickly, + especially on cores with a high number of issue slots per + cycle, as we get much better parallelism out of the operations. */ + + /* Start of critical section -- keep to one 64Byte cache line. 
*/ + ldp data1, data2, [src], #16 +L(realigned): + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f + bic has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + subs limit_wd, limit_wd, #1 + orr tmp1, has_nul1, has_nul2 + ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ + b.eq L(loop) + /* End of critical section -- keep to one 64Byte cache line. */ + + orr tmp1, has_nul1, has_nul2 + cbz tmp1, L(hit_limit) /* No null in final Qword. */ + + /* We know there's a null in the final Qword. The easiest thing + to do now is work out the length of the string and return + MIN (len, limit). */ + + sub len, src, srcin + cbz has_nul1, L(nul_in_data2) +#ifdef __AARCH64EB__ + mov data2, data1 +#endif + sub len, len, #8 + mov has_nul2, has_nul1 +L(nul_in_data2): +#ifdef __AARCH64EB__ + /* For big-endian, carry propagation (if the final byte in the + string is 0x01) means we cannot use has_nul directly. The + easiest way to get the correct byte is to byte-swap the data + and calculate the syndrome a second time. */ + rev data2, data2 + sub tmp1, data2, zeroones + orr tmp2, data2, #REP8_7f + bic has_nul2, tmp1, tmp2 +#endif + sub len, len, #8 + rev has_nul2, has_nul2 + clz pos, has_nul2 + add len, len, pos, lsr #3 /* Bits to bytes. */ + cmp len, limit + csel len, len, limit, ls /* Return the lower value. */ + RET + +L(loop): + ldr dataq, [src], #16 + uminv datab2, datav.16b + mov tmp1, datav2.d[0] + subs limit_wd, limit_wd, #1 + ccmp tmp1, #0, #4, pl /* NZCV = 0100 */ + b.eq L(loop_end) + ldr dataq, [src], #16 + uminv datab2, datav.16b + mov tmp1, datav2.d[0] + subs limit_wd, limit_wd, #1 + ccmp tmp1, #0, #4, pl /* NZCV = 0100 */ + b.ne L(loop) +L(loop_end): + /* End of critical section -- keep to one 64Byte cache line. */ + + cbnz tmp1, L(hit_limit) /* No null in final Qword. */ + + /* We know there's a null in the final Qword. The easiest thing + to do now is work out the length of the string and return + MIN (len, limit). 
*/ + +#ifdef __AARCH64EB__ + rev64 datav.16b, datav.16b +#endif + /* Set the NUL bytes to 0xff and the rest to 0x00, move the data into a + pair of scalars and then compute the length from the earliest NUL + byte. */ + + cmeq datav.16b, datav.16b, #0 + mov data1, datav.d[0] + mov data2, datav.d[1] + cmp data1, 0 + csel data1, data1, data2, ne + sub len, src, srcin + sub len, len, #16 + rev data1, data1 + add tmp2, len, 8 + clz tmp1, data1 + csel len, len, tmp2, ne + add len, len, tmp1, lsr 3 + cmp len, limit + csel len, len, limit, ls /* Return the lower value. */ + RET + +L(misaligned): + /* Deal with a partial first word. + We're doing two things in parallel here; + 1) Calculate the number of words (but avoiding overflow if + limit is near ULONG_MAX) - to do this we need to work out + limit + tmp1 - 1 as a 65-bit value before shifting it; + 2) Load and mask the initial data words - we force the bytes + before the ones we are interested in to 0xff - this ensures + early bytes will not hit any zero detection. */ + sub limit_wd, limit, #1 + neg tmp4, tmp1 + cmp tmp1, #8 + + and tmp3, limit_wd, #15 + lsr limit_wd, limit_wd, #4 + mov tmp2, #~0 + + ldp data1, data2, [src], #16 + lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */ + add tmp3, tmp3, tmp1 + +#ifdef __AARCH64EB__ + /* Big-endian. Early bytes are at MSB. */ + lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */ +#else + /* Little-endian. Early bytes are at LSB. */ + lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). 
*/ +#endif + add limit_wd, limit_wd, tmp3, lsr #4 + + orr data1, data1, tmp2 + orr data2a, data2, tmp2 + + csinv data1, data1, xzr, le + csel data2, data2, data2a, le + b L(realigned) + +L(hit_limit): + mov len, limit + RET +END (__strnlen_kunpeng) +weak_alias (__strnlen_kunpeng, strnlen_kunpeng) +libc_hidden_builtin_def (strnlen_kunpeng) diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S index 70283c8074..9a4dfbda15 100644 --- a/sysdeps/aarch64/strnlen.S +++ b/sysdeps/aarch64/strnlen.S @@ -25,6 +25,10 @@ * ARMv8-a, AArch64 */ +#ifndef STRNLEN +# define STRNLEN __strnlen +#endif + /* Arguments and results. */ #define srcin x0 #define len x0 @@ -49,7 +53,7 @@ #define REP8_7f 0x7f7f7f7f7f7f7f7f #define REP8_80 0x8080808080808080 -ENTRY_ALIGN_AND_PAD (__strnlen, 6, 9) +ENTRY_ALIGN_AND_PAD (STRNLEN, 6, 9) DELOUSE (0) DELOUSE (1) DELOUSE (2) @@ -159,7 +163,7 @@ L(misaligned): L(hit_limit): mov len, limit RET -END (__strnlen) -libc_hidden_def (__strnlen) -weak_alias (__strnlen, strnlen) +END (STRNLEN) +libc_hidden_def (STRNLEN) +weak_alias (STRNLEN, strnlen) libc_hidden_def (strnlen)