From patchwork Wed Jun 11 10:54:59 2014 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Richard Earnshaw X-Patchwork-Id: 1432 Received: (qmail 9358 invoked by alias); 11 Jun 2014 10:55:05 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 9345 invoked by uid 89); 11 Jun 2014 10:55:04 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-1.3 required=5.0 tests=AWL, BAYES_00, RCVD_IN_DNSWL_LOW, SPF_PASS autolearn=ham version=3.3.2 X-HELO: service87.mimecast.com Message-ID: <53983583.3040606@arm.com> Date: Wed, 11 Jun 2014 11:54:59 +0100 From: Richard Earnshaw User-Agent: Mozilla/5.0 (X11; Linux i686 on x86_64; rv:15.0) Gecko/20120907 Thunderbird/15.0.1 MIME-Version: 1.0 To: libc-alpha@sourceware.org Subject: [PATCH, AArch64] Add optimized strchr. X-MC-Unique: 114061111550000301 Implementation of strchr for AArch64. Speedups taken from micro-bench show the improvements relative to the standard C code. The use of LD1 means we have identical code for both big- and little-endian systems. A speedup (which is taken as the time to run the reference version divided by the time to run the new version) >1 implies an improvement (<1 a regression). The table only shows one sample that regresses and that's by marginally less than 1%. The geomean for all the samples is 1.58. Measurements are on Cortex-A57. Richard Earnshaw * sysdeps/aarch64/strchr.S: New file. OK? Length Alignment Speedup (Ratio: old/new) 32 0 1.11 32 1 1.44 64 0 1.62 64 2 1.63 128 0 2.21 128 3 2.28 256 0 3.17 256 4 2.71 512 0 3.17 512 5 2.68 1024 0 3.37 1024 6 3.25 2048 0 3.5 2048 7 3.59 64 1 1.7 64 1 1.63 64 2 1.64 64 2 1.62 64 3 1.64 64 3 1.62 64 4 1.63 64 4 1.6 64 5 1.64 64 5 1.6 64 6 1.67 64 6 1.61 64 7 1.59 64 7 1.54 0 0 1.05 0 0 1.05 1 0 1.13 1 0 1.07 2 0 1.09 2 0 1.07 3 0 1.14 3 0 1.11 4 0 1.15 4 0 1.1 5 0 1.14 5 0 1.14 6 0 1.15 6 0 1.14 7 0 1.21 7 0 1.18 8 0 1.26 8 0 1.26 9 0 1.37 9 0 1.25 10 0 1.29 10 0 1.25 11 0 1.34 11 0 1.28 12 0 1.35 12 0 1.29 13 0 1.35 13 0 1.32 14 0 1.37 14 0 1.33 15 0 1.4 15 0 1.35 16 0 1.52 16 0 1.36 17 0 1.49 17 0 1.38 18 0 1.41 18 0 1.39 19 0 1.44 19 0 1.41 20 0 1.47 20 0 1.17 21 0 1.24 21 0 1.57 22 0 1.61 22 0 1.53 23 0 1.65 23 0 1.61 24 0 1.72 24 0 1.7 25 0 1.77 25 0 1.65 26 0 1.74 26 0 1.64 27 0 1.88 27 0 1.68 28 0 1.74 28 0 1.69 29 0 1.88 29 0 1.75 30 0 1.79 30 0 1.74 31 0 1.84 31 0 1.8 32 0 1.56 32 1 1.51 64 0 1.74 64 2 1.81 128 0 2.35 128 3 2.59 256 0 3.22 256 4 2.92 512 0 3.09 512 5 2.8 1024 0 3.52 1024 6 3.28 2048 0 3.61 2048 7 3.5 64 1 1.8 64 1 1.73 64 2 1.74 64 2 1.69 64 3 1.75 64 3 1.72 64 4 1.7 64 4 1.68 64 5 1.71 64 5 1.7 64 6 1.73 64 6 1.69 64 7 1.67 64 7 1.65 0 0 1.04 0 0 0.99 1 0 1.11 1 0 1.08 2 0 1.15 2 0 1.12 3 0 1.16 3 0 1.14 4 0 1.17 4 0 1.15 5 0 1.22 5 0 1.19 6 0 1.22 6 0 1.21 7 0 1.24 7 0 1.23 8 0 1.33 8 0 1.33 9 0 1.37 9 0 1.36 10 0 1.4 10 0 1.34 11 0 1.38 11 0 1.39 12 0 1.45 12 0 1.4 13 0 1.52 13 0 1.44 14 0 1.46 14 0 1.43 15 0 1.49 15 0 1.48 16 0 1.49 16 0 1.47 17 0 1.51 17 0 1.47 18 0 1.54 18 0 1.53 19 0 1.6 19 0 1.56 20 0 1.57 20 0 1.58 21 0 1.68 21 0 1.6 22 0 1.59 22 0 1.57 23 0 1.64 23 0 1.62 24 0 1.64 24 0 1.63 25 0 1.71 25 0 1.68 26 0 1.74 26 0 1.66 27 0 1.72 27 0 1.68 28 0 1.77 28 0 1.7 29 0 1.77 29 0 1.74 30 0 1.79 30 0 1.75 31 0 1.82 31 0 1.8 Best improvement 3.61 Worst regression 0.99 Geomean 1.58 diff --git a/sysdeps/aarch64/strchr.S b/sysdeps/aarch64/strchr.S new file mode 100644 index 0000000..cdf46ef --- /dev/null +++ b/sysdeps/aarch64/strchr.S @@ -0,0 +1,138 @@ +/* strchr - find a character in a string + + Copyright (C) 2014 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#include + +/* Assumptions: + * + * ARMv8-a, AArch64 + */ + +/* Arguments and results. */ +#define srcin x0 +#define chrin w1 + +#define result x0 + +#define src x2 +#define tmp1 x3 +#define wtmp2 w4 +#define tmp3 x5 + +#define vrepchr v0 +#define vdata1 v1 +#define vdata2 v2 +#define vhas_nul1 v3 +#define vhas_nul2 v4 +#define vhas_chr1 v5 +#define vhas_chr2 v6 +#define vrepmask_0 v7 +#define vrepmask_c v16 +#define vend1 v17 +#define vend2 v18 + + /* Core algorithm. + For each 32-byte hunk we calculate a 64-bit syndrome value, with + two bits per byte (LSB is always in bits 0 and 1, for both big + and little-endian systems). Bit 0 is set iff the relevant byte + matched the requested character. Bit 1 is set iff the + relevant byte matched the NUL end of string (we trigger off bit0 + for the special case of looking for NUL). Since the bits + in the syndrome reflect exactly the order in which things occur + in the original string a count_trailing_zeros() operation will + identify exactly which byte is causing the termination, and why. */ + +/* Locals and temporaries. */ + +ENTRY (strchr) + mov wtmp2, #0x0401 + movk wtmp2, #0x4010, lsl #16 + dup vrepchr.16b, chrin + bic src, srcin, #31 + dup vrepmask_c.4s, wtmp2 + ands tmp1, srcin, #31 + add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s // lsl #1 + b.eq L(loop) + + /* Input string is not 32-byte aligned. Rather than forcing + the padding bytes to a safe value, we calculate the syndrome + for all the bytes, but then mask off those bits of the + syndrome that are related to the padding. */ + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + neg tmp1, tmp1 + cmeq vhas_nul1.16b, vdata1.16b, #0 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_nul2.16b, vdata2.16b, #0 + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b + and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b + and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b + orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b + orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b + lsl tmp1, tmp1, #1 + addp vend1.16b, vend1.16b, vend2.16b // 256->128 + mov tmp3, #~0 + addp vend1.16b, vend1.16b, vend2.16b // 128->64 + lsr tmp1, tmp3, tmp1 + + mov tmp3, vend1.2d[0] + bic tmp1, tmp3, tmp1 // Mask padding bits. + cbnz tmp1, L(tail) + +L(loop): + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + cmeq vhas_nul1.16b, vdata1.16b, #0 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_nul2.16b, vdata2.16b, #0 + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + /* Use a fast check for the termination condition. */ + orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b + orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b + orr vend1.16b, vend1.16b, vend2.16b + addp vend1.2d, vend1.2d, vend1.2d + mov tmp1, vend1.2d[0] + cbz tmp1, L(loop) + + /* Termination condition found. Now need to establish exactly why + we terminated. */ + and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b + and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b + and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b + orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b + orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b + addp vend1.16b, vend1.16b, vend2.16b // 256->128 + addp vend1.16b, vend1.16b, vend2.16b // 128->64 + + mov tmp1, vend1.2d[0] +L(tail): + sub src, src, #32 + rbit tmp1, tmp1 + clz tmp1, tmp1 + /* Tmp1 is even if the target charager was found first. Otherwise + we've found the end of string and we weren't looking for NUL. */ + tst tmp1, #1 + add result, src, tmp1, lsr #1 + csel result, result, xzr, eq + ret +END (strchr) +libc_hidden_builtin_def (strchr) +weak_alias (strchr, index)