From patchwork Wed Oct 19 14:58:00 2016
X-Patchwork-Submitter: Wilco Dijkstra
X-Patchwork-Id: 16671
From: Wilco Dijkstra
To: "libc-alpha@sourceware.org", Richard Earnshaw, Szabolcs Nagy, "Marcus Shawcroft"
CC: nd
Subject: Re: [PATCH][AArch64] Add optimized memchr
Date: Wed, 19 Oct 2016 14:58:00 +0000

ping

-----Original Message-----
From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
Sent: 25 September 2015 14:21
To: 'GNU C Library'
Subject: [PATCH][AArch64] Add optimized memchr

An optimized memchr was missing for AArch64.  This version is similar to
strchr and is significantly faster than the C version.  Passes GLIBC
tests.  OK for commit?
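The core trick, shared with the AArch64 strchr, is a per-chunk "syndrome"
word: each 32-byte chunk yields a 64-bit syndrome with two bits per byte,
where bit 2*i is set when byte i matches, so a trailing-zero count halved
gives the index of the first match.  A minimal scalar C model of that idea
(illustrative only, not part of the patch; the name memchr_model is made up):

#include <stddef.h>
#include <stdint.h>

/* Scalar model of the two-bits-per-byte syndrome technique.  */
static void *
memchr_model (const void *s, int c, size_t n)
{
  const unsigned char *p = s;
  unsigned char ch = (unsigned char) c;

  while (n > 0)
    {
      size_t chunk = n < 32 ? n : 32;
      uint64_t synd = 0;

      /* Set bit 2*i of the syndrome when byte i matches.  */
      for (size_t i = 0; i < chunk; i++)
	if (p[i] == ch)
	  synd |= (uint64_t) 1 << (2 * i);

      /* Trailing-zero count / 2 is the index of the first match.  */
      if (synd != 0)
	return (void *) (p + (__builtin_ctzll (synd) >> 1));

      p += chunk;
      n -= chunk;
    }
  return NULL;
}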
ChangeLog:
2015-09-25  Wilco Dijkstra  <wdijkstr@arm.com>
2015-09-25  Kevin Petit

	* sysdeps/aarch64/memchr.S (__memchr): New file.
---
 sysdeps/aarch64/memchr.S | 157 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 sysdeps/aarch64/memchr.S

diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
new file mode 100644
index 0000000..2f643dd
--- /dev/null
+++ b/sysdeps/aarch64/memchr.S
@@ -0,0 +1,157 @@
+/* memchr - find a character in a memory zone
+
+   Copyright (C) 2015 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+/* Arguments and results.  */
+#define srcin		x0
+#define chrin		w1
+#define cntin		x2
+
+#define result		x0
+
+#define src		x3
+#define tmp		x4
+#define wtmp2		w5
+#define synd		x6
+#define soff		x9
+#define cntrem		x10
+
+#define vrepchr		v0
+#define vdata1		v1
+#define vdata2		v2
+#define vhas_chr1	v3
+#define vhas_chr2	v4
+#define vrepmask	v5
+#define vend		v6
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+ * per byte. For each tuple, bit 0 is set if the relevant byte matched the
+ * requested character and bit 1 is not used (faster than using a 32-bit
+ * syndrome). Since the bits in the syndrome reflect exactly the order in
+ * which things occur in the original string, counting trailing zeros allows
+ * us to identify exactly which byte has matched.
+ */
+
+ENTRY (__memchr)
+	/* Do not dereference srcin if no bytes to compare.  */
+	cbz	cntin, L(zero_length)
+	/*
+	 * Magic constant 0x40100401 allows us to identify which lane matches
+	 * the requested byte.
+	 */
+	mov	wtmp2, #0x0401
+	movk	wtmp2, #0x4010, lsl #16
+	dup	vrepchr.16b, chrin
+	/* Work with aligned 32-byte chunks */
+	bic	src, srcin, #31
+	dup	vrepmask.4s, wtmp2
+	ands	soff, srcin, #31
+	and	cntrem, cntin, #31
+	b.eq	L(loop)
+
+	/*
+	 * Input string is not 32-byte aligned. We calculate the syndrome
+	 * value for the aligned 32-byte block containing the first bytes
+	 * and mask the irrelevant part.
+	 */
+
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	sub	tmp, soff, #32
+	adds	cntin, cntin, tmp
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
+	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
+	mov	synd, vend.2d[0]
+	/* Clear the soff*2 lower bits */
+	lsl	tmp, soff, #1
+	lsr	synd, synd, tmp
+	lsl	synd, synd, tmp
+	/* The first block can also be the last */
+	b.ls	L(masklast)
+	/* Have we found something already?  */
+	cbnz	synd, L(tail)
+
+L(loop):
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	subs	cntin, cntin, #32
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	/* If we're out of data we finish regardless of the result */
+	b.ls	L(end)
+	/* Use a fast check for the termination condition */
+	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
+	addp	vend.2d, vend.2d, vend.2d
+	mov	synd, vend.2d[0]
+	/* We're not out of data, loop if we haven't found the character */
+	cbz	synd, L(loop)
+
+L(end):
+	/* Termination condition found, let's calculate the syndrome value */
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
+	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
+	mov	synd, vend.2d[0]
+	/* Only do the clear for the last possible block */
+	b.hi	L(tail)
+
+L(masklast):
+	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
+	add	tmp, cntrem, soff
+	and	tmp, tmp, #31
+	sub	tmp, tmp, #32
+	neg	tmp, tmp, lsl #1
+	lsl	synd, synd, tmp
+	lsr	synd, synd, tmp
+
+L(tail):
+	/* Count the trailing zeros using bit reversing */
+	rbit	synd, synd
+	/* Compensate the last post-increment */
+	sub	src, src, #32
+	/* Check that we have found a character */
+	cmp	synd, #0
+	/* And count the leading zeros */
+	clz	synd, synd
+	/* Compute the potential result */
+	add	result, src, synd, lsr #1
+	/* Select result or NULL */
+	csel	result, xzr, result, eq
+	ret
+
+L(zero_length):
+	mov	result, #0
+	ret
+END (__memchr)
+weak_alias (__memchr, memchr)
+libc_hidden_builtin_def (memchr)
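A note on the masking around the unaligned first block and the final block:
the lsl/lsr pairs clear syndrome bits belonging to bytes outside the buffer.
A C sketch of the same arithmetic (the helper names mask_first_block and
mask_last_block are mine, for illustration only):

#include <stdint.h>

/* First block: drop matches from the soff bytes that precede the start
   of the buffer inside the aligned block (soff = srcin & 31).  */
static uint64_t
mask_first_block (uint64_t synd, unsigned soff)
{
  unsigned shift = 2 * soff;	/* soff < 32, so shift < 64.  */
  return (synd >> shift) << shift;
}

/* L(masklast): drop matches past the end of the buffer.  valid is
   (cntrem + soff) & 31.  When it is 0 the whole block is valid; the asm
   gets this for free because AArch64 variable shifts take the amount
   modulo 64, which C shifts do not, hence the explicit check.  */
static uint64_t
mask_last_block (uint64_t synd, unsigned valid)
{
  if (valid == 0)
    return synd;
  unsigned shift = 2 * (32 - valid);
  return (synd << shift) >> shift;
}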