From patchwork Wed Oct 17 08:45:12 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Xue Feng X-Patchwork-Id: 29773 Received: (qmail 8038 invoked by alias); 17 Oct 2018 08:46:37 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 6627 invoked by uid 89); 17 Oct 2018 08:45:30 -0000 Authentication-Results: sourceware.org; auth=none X-Spam-SWARE-Status: No, score=-24.2 required=5.0 tests=AWL, BAYES_00, FORGED_HOTMAIL_RCVD2, FREEMAIL_FROM, GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, KAM_SHORT, RCVD_IN_DNSWL_NONE, SPF_HELO_PASS, SPF_PASS autolearn=ham version=3.3.2 spammy=8000, Available, 2, 5 X-HELO: APC01-SG2-obe.outbound.protection.outlook.com DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=hotmail.com; s=selector1; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=FQujC9nE1LhxhRe1y0LHCJJ39wDdbYBpM8Rmn8xreEI=; b=aFhaAzohXtNsQQ5bN/FKhKcGvJy5LocJuGP/kX+BModcH3PGc3Rv+IYFxyoAEUxxXpovzCLVOajUmXmKVHOP+ofPa2ywGY7E5IbqXqAVV2i67KjvPAFqtvlYNnLfvx8WY1jKaV4lUPXWSAnYHg3kPfr4spPpTYpHZXPAz9HuVYMccdkun5+J8bC/Bozuzt0QEysE/TyDrCik+HXaRj8TICRRRNxDIM3heIlhnH+oYmRBKgCJPo0/4pbk0QTOIE3K09W6e/v4JQOeUU3fJ4WIGoKGVkHXjNYruY6FtuUJcnLG/i2E2jCV/gGLK6jlNhqDY6Rcp0VVFuB5+kc1bnHrRQ== From: Feng Xue To: "libc-alpha@sourceware.org" CC: "marcus.shawcroft@linaro.org" , "szabolcs.nagy@arm.com" , Richard Henderson , Feng Xue Subject: [PATCH v2 3/3] aarch64: Optimized memchr specific to AmpereComputing skylark Date: Wed, 17 Oct 2018 08:45:12 +0000 Message-ID: MIME-Version: 1.0 Although prefetch load in previous version can benefit performance, it might cause a segfault. Thus, this patch removed that to ensure correct behaviour. 
Feng --- This version uses general register based memory instruction to load data, because vector register based is slightly slower in skylark. Character-matching is performed on 16-byte (both size and alignment) memory block in parallel each iteration. * sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR. [!MEMCHR](MEMCHR): Set to __memchr. * sysdeps/aarch64/multiarch/Makefile (sysdep_routines): Add memchr_generic and memchr_skylark. * sysdeps/aarch64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add memchr ifuncs. * sysdeps/aarch64/multiarch/memchr.c: New file. * sysdeps/aarch64/multiarch/memchr_generic.S: Likewise. * sysdeps/aarch64/multiarch/memchr_skylark.S: Likewise. --- ChangeLog | 12 ++ sysdeps/aarch64/memchr.S | 10 +- sysdeps/aarch64/multiarch/Makefile | 1 + sysdeps/aarch64/multiarch/ifunc-impl-list.c | 3 + sysdeps/aarch64/multiarch/memchr.c | 41 ++++++ sysdeps/aarch64/multiarch/memchr_generic.S | 33 +++++ sysdeps/aarch64/multiarch/memchr_skylark.S | 217 ++++++++++++++++++++++++++++ 7 files changed, 314 insertions(+), 3 deletions(-) create mode 100644 sysdeps/aarch64/multiarch/memchr.c create mode 100644 sysdeps/aarch64/multiarch/memchr_generic.S create mode 100644 sysdeps/aarch64/multiarch/memchr_skylark.S diff --git a/ChangeLog b/ChangeLog index 28370f9..e64b8b3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +2018-10-13 Feng Xue + + * sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR. + [!MEMCHR](MEMCHR): Set to __memchr. + * sysdeps/aarch64/multiarch/Makefile (sysdep_routines): + Add memchr_generic and memchr_skylark. + * sysdeps/aarch64/multiarch/ifunc-impl-list.c + (__libc_ifunc_impl_list): Add memchr ifuncs. + * sysdeps/aarch64/multiarch/memchr.c: New file. + * sysdeps/aarch64/multiarch/memchr_generic.S: Likewise. + * sysdeps/aarch64/multiarch/memchr_skylark.S: Likewise. 
+ 2018-10-12 Feng Xue * sysdeps/aarch64/multiarch/Makefile (sysdep_routines): diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S index e422aef..4afebd3 100644 --- a/sysdeps/aarch64/memchr.S +++ b/sysdeps/aarch64/memchr.S @@ -26,6 +26,10 @@ * Neon Available. */ +#ifndef MEMCHR +# define MEMCHR __memchr +#endif + /* Arguments and results. */ #define srcin x0 #define chrin w1 @@ -59,7 +63,7 @@ * identify exactly which byte has matched. */ -ENTRY (__memchr) +ENTRY (MEMCHR) /* Do not dereference srcin if no bytes to compare. */ cbz cntin, L(zero_length) /* @@ -152,6 +156,6 @@ L(tail): L(zero_length): mov result, #0 ret -END (__memchr) -weak_alias (__memchr, memchr) +END (MEMCHR) +weak_alias (MEMCHR, memchr) libc_hidden_builtin_def (memchr) diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile index 828ce4f..353ece7 100644 --- a/sysdeps/aarch64/multiarch/Makefile +++ b/sysdeps/aarch64/multiarch/Makefile @@ -2,5 +2,6 @@ ifeq ($(subdir),string) sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \ memcpy_falkor memmove_falkor \ memset_generic memset_falkor memset_skylark \ + memchr_generic memchr_skylark \ strlen_generic strlen_asimd endif diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c index baf01a0..f5014d2 100644 --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c @@ -53,6 +53,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor) IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_skylark) IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic)) + IFUNC_IMPL (i, name, memchr, + IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_skylark) + IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_generic)) IFUNC_IMPL (i, name, strlen, IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd) diff --git 
a/sysdeps/aarch64/multiarch/memchr.c b/sysdeps/aarch64/multiarch/memchr.c new file mode 100644 index 0000000..cbcf8b7 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memchr.c @@ -0,0 +1,41 @@ +/* Multiple versions of memchr. AARCH64 version. + Copyright (C) 2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +/* Define multiple versions only for the definition in libc. */ + +#if IS_IN (libc) +/* Redefine memchr so that the compiler won't complain about the type + mismatch with the IFUNC selector in strong_alias, below. */ +# undef memchr +# define memchr __redirect_memchr +# include +# include + +extern __typeof (__redirect_memchr) __memchr; + +extern __typeof (__redirect_memchr) __memchr_generic attribute_hidden; +extern __typeof (__redirect_memchr) __memchr_skylark attribute_hidden; + +libc_ifunc (__memchr, + ((IS_SKYLARK (midr) + ? __memchr_skylark + : __memchr_generic))); + +# undef memchr +strong_alias (__memchr, memchr); +#endif diff --git a/sysdeps/aarch64/multiarch/memchr_generic.S b/sysdeps/aarch64/multiarch/memchr_generic.S new file mode 100644 index 0000000..707148b --- /dev/null +++ b/sysdeps/aarch64/multiarch/memchr_generic.S @@ -0,0 +1,33 @@ +/* Memchr for aarch64, default version for internal use. + Copyright (C) 2018 Free Software Foundation, Inc. 
+ + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#if IS_IN (libc) +# define MEMCHR __memchr_generic + +/* Do not hide the generic version of memchr, we use it internally. */ +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) + +/* Add a hidden definition for use within libc.so. */ +# ifdef SHARED + .globl __GI_memchr; __GI_memchr = __memchr_generic +# endif +#endif + +# include "../memchr.S" diff --git a/sysdeps/aarch64/multiarch/memchr_skylark.S b/sysdeps/aarch64/multiarch/memchr_skylark.S new file mode 100644 index 0000000..f4dbe58 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memchr_skylark.S @@ -0,0 +1,217 @@ +/* Optimized memchr for AmpereComputing skylark processor. + + Copyright (C) 2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#include + +#if IS_IN (libc) +# define MEMCHR __memchr_skylark + +/* Arguments and results. */ +#define srcin x0 +#define chrin x1 +#define cntin x2 + +#define result x0 + +#define repchr x1 + +#define tmp1 x2 +#define tmp2 x3 +#define tmp3 x4 +#define tmp4 x5 + +#define src x6 +#define srcend x7 +#define srcend16 x8 + +#define anymore x9 + +#define zeroones x10 + +#define data1 x11 +#define data2 x12 + +#define has_chr1 x13 +#define has_chr2 x14 + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +ENTRY_ALIGN (MEMCHR, 6) + + DELOUSE (0) + DELOUSE (2) + + /* Do not dereference srcin if no bytes to compare. */ + cbz cntin, L(none_chr) + + mov zeroones, REP8_01 + and repchr, chrin, 255 + /* Generate a qword integer as |c|c|c|c|c|c|c|c|. */ + mul repchr, repchr, zeroones + + /* Start address is 16-byte aligned or not? */ + tst srcin, 15 + bic src, srcin, 15 + + add srcend, srcin, cntin + /* + * srcend16 is address of the block following the last block. + * + * [A block is 16-byte aligned and sized.] + */ + add srcend16, srcend, 15 + bic srcend16, srcend16, 15 + + b.eq L(loop) + + /* Load the first block containing start address. */ + ldp data1, data2, [src], 16 + + lsl tmp1, srcin, 3 + mov tmp2, ~0 +#ifdef __AARCH64EB__ + lsr tmp3, tmp2, tmp1 +#else + lsl tmp3, tmp2, tmp1 +#endif + /* Start address is in the first or the second qword? */ + tst srcin, 8 + + /* + * Transform any byte in the block to zero using XOR operation, + * if that byte equals the char to search. In this way, searching + * the char becomes detecting zero in the resulting two qwords. + */ + eor data1, data1, repchr + eor data2, data2, repchr + + /* + * Set those unused bytes(before start address) to 0xff, so + * that they will not hit any zero detection. 
+ */ + orn tmp1, data1, tmp3 + orn tmp2, data2, tmp3 + + csinv data1, tmp1, xzr, eq + csel data2, data2, tmp2, eq + + /* + * When the first and last block are the same, there are two cases: + * o. Memory range to search is just in one block. + * ( first address - last address) < 0 + * + * o. Memory range is so large that last address wrap-around. + * ( first address - last address) > 0 + */ + cmp srcin, srcend + ccmp src, srcend16, 0, mi + csetm anymore, ne + b L(find_chr) + + .p2align 4 +L(loop): + ldp data1, data2, [src], 16 + + subs anymore, src, srcend16 + + /* + * Transform any byte in the block to zero using XOR operation, + * if that byte equals the char to search. + */ + eor data1, data1, repchr + eor data2, data2, repchr + +L(find_chr): + /* + * Use the following integer test to find out if any byte in a + * qword is zero. If do not contain zero-valued byte, test result + * is zero. + * + * (qword - 0x0101010101010101) & ~(qword) & 0x8080808080808080 + * = + * (qword - 0x0101010101010101) & ~(qword | 0x7f7f7f7f7f7f7f7f) + * + */ + sub tmp1, data1, zeroones + sub tmp2, data2, zeroones + + orr tmp3, data1, REP8_7f + orr tmp4, data2, REP8_7f + + bic has_chr1, tmp1, tmp3 + bic has_chr2, tmp2, tmp4 + + orr tmp1, has_chr1, has_chr2 + ccmp tmp1, 0, 0, ne + + b.eq L(loop) + + cbz has_chr1, 1f +#ifdef __AARCH64EB__ + rev data1, data1 +#else + rev has_chr1, has_chr1 +#endif + sub result, src, 16 + b L(done) + +1: cbz has_chr2, L(none_chr) +#ifdef __AARCH64EB__ + rev data1, data2 +#else + rev has_chr1, has_chr2 +#endif + sub result, src, 8 + +L(done): +#ifdef __AARCH64EB__ + /* + * For big-endian, can not directly use has_chr1/has_chr2 because + * two qwords has been reversed after loading from memory. + * Thus, have to perform char detection on two qwords again, which + * should be byte-swapped this time. 
+ */ + sub tmp1, data1, zeroones + orr tmp3, data1, REP8_7f + bic has_chr1, tmp1, tmp3 + rev has_chr1, has_chr1 +#endif + + /* + * If the specified char is found in a qword, the corresponding + * byte in has_chr is non-zero, though this is guaranteed only for + * the first occurrence, not other occurrences. + */ + cmp anymore, 0 + clz tmp1, has_chr1 + add result, result, tmp1, lsr 3 + ccmp result, srcend, 8, eq /* NZCV = 8000 */ + csel result, result, xzr, mi + ret + +L(none_chr): + mov result, 0 + ret + +END (MEMCHR) +libc_hidden_builtin_def (MEMCHR) + +#endif