From patchwork Wed Nov 2 16:45:31 2016 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Patchwork-Submitter: Wilco Dijkstra X-Patchwork-Id: 17107 Received: (qmail 57370 invoked by alias); 2 Nov 2016 16:45:45 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 57351 invoked by uid 89); 2 Nov 2016 16:45:44 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-1.9 required=5.0 tests=AWL, BAYES_00, RCVD_IN_DNSWL_NONE, SPF_HELO_PASS, SPF_PASS autolearn=ham version=3.3.2 spammy=bic, petit, ENTRY, lesser X-HELO: EUR01-DB5-obe.outbound.protection.outlook.com From: Wilco Dijkstra To: "libc-alpha@sourceware.org" , Marcus Shawcroft CC: nd , Szabolcs Nagy , Richard Earnshaw Subject: Re: [PATCH][AArch64] Add optimized memchr Date: Wed, 2 Nov 2016 16:45:31 +0000 Message-ID: References: , In-Reply-To: authentication-results: spf=none (sender IP is ) smtp.mailfrom=Wilco.Dijkstra@arm.com; x-ms-office365-filtering-correlation-id: 72fd65dd-62ef-4fde-3363-08d4033fa8a4 x-microsoft-exchange-diagnostics: 1; AM5PR0802MB2612; 7:iDSp4VrmJXy3YQ4gjZm6xM6qqKV+JixUAm5AfvUQn7FpJtGHbLZwpmDJ3LK4Pz/8K4PpCQ0O67mHJllH1pNviNt4ds/f4EoNkAeGg3rygm29EYXTq7k8UT3eoVLOHTiEWabMDBkknh49DvJz3VMzhz+lhChYRzfaDWGbBlO0VYhsToXd4zTU7Qffu4ufJcPr45QZ6YiNNH/J/0cO2API9jo0tp13reczCZYj4SfdcsUtCPdK+zyoAmEuO7Dr5S2sCm19FDouhM1XuHL+t969SVb22gx15zqD8d3qPMx5FKrQDBvqRisT0dYg4m6tAYrGlMF6v0IuNR9R0wNF4kCqdvdjvo8Mch+onAFUh35TNd8= x-microsoft-antispam: UriScan:;BCL:0;PCL:0;RULEID:;SRVR:AM5PR0802MB2612; nodisclaimer: True x-microsoft-antispam-prvs: x-exchange-antispam-report-test: UriScan:(180628864354917); x-exchange-antispam-report-cfa-test: BCL:0; PCL:0; RULEID:(102415321)(6040176)(601004)(2401047)(8121501046)(5005006)(3002001)(10201501046)(6055026); SRVR:AM5PR0802MB2612; BCL:0; PCL:0; RULEID:; SRVR:AM5PR0802MB2612; x-forefront-prvs: 0114FF88F6 x-forefront-antispam-report: SFV:NSPM; SFS:(10009020)(979002)(6009001)(7916002)(377424004)(13464003)(54534003)(199003)(189002)(5660300001)(6636002)(87936001)(3660700001)(106356001)(92566002)(86362001)(5001770100001)(97736004)(2501003)(9686002)(6116002)(3846002)(3280700002)(102836003)(586003)(68736007)(8936002)(19580395003)(11100500001)(105586002)(10400500002)(19580405001)(7696004)(101416001)(50986999)(450100001)(2900100001)(54356999)(76176999)(66066001)(99936001)(77096005)(5002640100001)(2950100002)(33656002)(74316002)(2906002)(189998001)(305945005)(4326007)(81156014)(7736002)(7846002)(122556002)(81166006)(76576001)(8676002)(969003)(989001)(999001)(1009001)(1019001); DIR:OUT; SFP:1101; SCL:1; SRVR:AM5PR0802MB2612; H:AM5PR0802MB2610.eurprd08.prod.outlook.com; FPR:; SPF:None; PTR:InfoNoRecords; A:1; MX:1; LANG:en; received-spf: None (protection.outlook.com: arm.com does not designate permitted sender hosts) spamdiagnosticoutput: 1:99 spamdiagnosticmetadata: NSPM MIME-Version: 1.0 X-OriginatorOrg: arm.com X-MS-Exchange-CrossTenant-originalarrivaltime: 02 Nov 2016 16:45:31.1527 (UTC) X-MS-Exchange-CrossTenant-fromentityheader: Hosted X-MS-Exchange-CrossTenant-id: f34e5979-57d9-4aaa-ad4d-b122a662184d X-MS-Exchange-Transport-CrossTenantHeadersStamped: AM5PR0802MB2612 ping -----Original Message----- From: Wilco Dijkstra [mailto:wdijkstr@arm.com] Sent: 25 September 2015 14:21 To: 'GNU C Library' Subject: [PATCH][AArch64] Add optimized memchr An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests. OK for commit? ChangeLog: 2015-09-25  Wilco Dijkstra  2015-09-25  Kevin Petit          * sysdeps/aarch64/memchr.S (__memchr): New file. --- sysdeps/aarch64/memchr.S | 157 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 sysdeps/aarch64/memchr.S diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S new file mode 100644 index 0000000..2f643dd --- /dev/null +++ b/sysdeps/aarch64/memchr.S @@ -0,0 +1,157 @@ +/* memchr - find a character in a memory zone + + Copyright (C) 2015 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#include + +/* Assumptions: + * + * ARMv8-a, AArch64 + * Neon Available. + */ + +/* Arguments and results. */ +#define srcin x0 +#define chrin w1 +#define cntin x2 + +#define result x0 + +#define src x3 +#define tmp x4 +#define wtmp2 w5 +#define synd x6 +#define soff x9 +#define cntrem x10 + +#define vrepchr v0 +#define vdata1 v1 +#define vdata2 v2 +#define vhas_chr1 v3 +#define vhas_chr2 v4 +#define vrepmask v5 +#define vend v6 + +/* + * Core algorithm: + * + * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits + * per byte. For each tuple, bit 0 is set if the relevant byte matched the + * requested character and bit 1 is not used (faster than using a 32bit + * syndrome). Since the bits in the syndrome reflect exactly the order in which + * things occur in the original string, counting trailing zeros allows to + * identify exactly which byte has matched. + */ + +ENTRY (__memchr) + /* Do not dereference srcin if no bytes to compare. */ + cbz cntin, L(zero_length) + /* + * Magic constant 0x40100401 allows us to identify which lane matches + * the requested byte. + */ + mov wtmp2, #0x0401 + movk wtmp2, #0x4010, lsl #16 + dup vrepchr.16b, chrin + /* Work with aligned 32-byte chunks */ + bic src, srcin, #31 + dup vrepmask.4s, wtmp2 + ands soff, srcin, #31 + and cntrem, cntin, #31 + b.eq L(loop) + + /* + * Input string is not 32-byte aligned. We calculate the syndrome + * value for the aligned 32 bytes block containing the first bytes + * and mask the irrelevant part. + */ + + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + sub tmp, soff, #32 + adds cntin, cntin, tmp + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b + addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ + addp vend.16b, vend.16b, vend.16b /* 128->64 */ + mov synd, vend.2d[0] + /* Clear the soff*2 lower bits */ + lsl tmp, soff, #1 + lsr synd, synd, tmp + lsl synd, synd, tmp + /* The first block can also be the last */ + b.ls L(masklast) + /* Have we found something already? */ + cbnz synd, L(tail) + +L(loop): + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + subs cntin, cntin, #32 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + /* If we're out of data we finish regardless of the result */ + b.ls L(end) + /* Use a fast check for the termination condition */ + orr vend.16b, vhas_chr1.16b, vhas_chr2.16b + addp vend.2d, vend.2d, vend.2d + mov synd, vend.2d[0] + /* We're not out of data, loop if we haven't found the character */ + cbz synd, L(loop) + +L(end): + /* Termination condition found, let's calculate the syndrome value */ + and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b + addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ + addp vend.16b, vend.16b, vend.16b /* 128->64 */ + mov synd, vend.2d[0] + /* Only do the clear for the last possible block */ + b.hi L(tail) + +L(masklast): + /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ + add tmp, cntrem, soff + and tmp, tmp, #31 + sub tmp, tmp, #32 + neg tmp, tmp, lsl #1 + lsl synd, synd, tmp + lsr synd, synd, tmp + +L(tail): + /* Count the trailing zeros using bit reversing */ + rbit synd, synd + /* Compensate the last post-increment */ + sub src, src, #32 + /* Check that we have found a character */ + cmp synd, #0 + /* And count the leading zeros */ + clz synd, synd + /* Compute the potential result */ + add result, src, synd, lsr #1 + /* Select result or NULL */ + csel result, xzr, result, eq + ret + +L(zero_length): + mov result, #0 + ret +END (__memchr) +weak_alias (__memchr, memchr) +libc_hidden_builtin_def (memchr)