From patchwork Tue Jun 21 13:35:37 2016 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Patchwork-Submitter: Wilco Dijkstra X-Patchwork-Id: 13283 Received: (qmail 24731 invoked by alias); 21 Jun 2016 13:35:59 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 24715 invoked by uid 89); 21 Jun 2016 13:35:58 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-1.9 required=5.0 tests=BAYES_00, SPF_PASS autolearn=ham version=3.3.2 spammy=petit, Compensate, Hx-languages-length:5485, Assumptions X-HELO: eu-smtp-delivery-143.mimecast.com From: Wilco Dijkstra To: Marcus Shawcroft CC: nd , 'GNU C Library' Subject: Re: [PATCH][AArch64] Add optimized memchr Date: Tue, 21 Jun 2016 13:35:37 +0000 Message-ID: References: , , In-Reply-To: x-ms-office365-filtering-correlation-id: 5fe43a3d-0c9e-4131-092d-08d399d8ee5d x-microsoft-exchange-diagnostics: 1; DB5PR0801MB1478; 20:fvDbCCATP1XW2DdYIeBeasFNHy0uJt90jqwAdfFcmgDnG1avXqQi86djOx0zoqvbFvUS5xFp6rNXntqKzWsXhtWx5kMHhe0oBGZewh3lOO5pDZhP74RB2TLXFXhmxGsqHfaVcFhrJ4vm0xfLrOh7Dbk4wxgxtGEcBI/30pre49w= x-microsoft-antispam: UriScan:;BCL:0;PCL:0;RULEID:;SRVR:DB5PR0801MB1478; nodisclaimer: True x-microsoft-antispam-prvs: x-exchange-antispam-report-test: UriScan:(180628864354917); x-exchange-antispam-report-cfa-test: BCL:0; PCL:0; RULEID:(102415321)(601004)(2401047)(8121501046)(5005006)(10201501046)(3002001)(6055026); SRVR:DB5PR0801MB1478; BCL:0; PCL:0; RULEID:; SRVR:DB5PR0801MB1478; x-forefront-prvs: 098076C36C x-forefront-antispam-report: SFV:NSPM; 
SFS:(10009020)(6009001)(7916002)(189002)(377424004)(13464003)(199003)(54534003)(76176999)(99936001)(54356999)(50986999)(101416001)(86362001)(110136002)(5250100002)(3280700002)(66066001)(189998001)(102836003)(3846002)(6116002)(19580395003)(97736004)(7696003)(7736002)(2906002)(5003600100003)(74316001)(81156014)(8676002)(586003)(7846002)(81166006)(19580405001)(8936002)(4326007)(68736007)(9686002)(3660700001)(2950100001)(76576001)(2900100001)(450100001)(11100500001)(87936001)(105586002)(106356001)(33656002)(92566002)(5002640100001); DIR:OUT; SFP:1101; SCL:1; SRVR:DB5PR0801MB1478; H:AM3PR08MB0088.eurprd08.prod.outlook.com; FPR:; SPF:None; PTR:InfoNoRecords; MX:1; A:1; LANG:en; spamdiagnosticoutput: 1:99 spamdiagnosticmetadata: NSPM MIME-Version: 1.0 X-OriginatorOrg: arm.com X-MS-Exchange-CrossTenant-originalarrivaltime: 21 Jun 2016 13:35:37.9811 (UTC) X-MS-Exchange-CrossTenant-fromentityheader: Hosted X-MS-Exchange-CrossTenant-id: f34e5979-57d9-4aaa-ad4d-b122a662184d X-MS-Exchange-Transport-CrossTenantHeadersStamped: DB5PR0801MB1478 X-MC-Unique: kt-lsmwgMfuiIadjouuwuA-1 ping -----Original Message----- From: Wilco Dijkstra [mailto:wdijkstr@arm.com] Sent: 25 September 2015 14:21 To: 'GNU C Library' Subject: [PATCH][AArch64] Add optimized memchr An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests. OK for commit? ChangeLog: 2015-09-25  Wilco Dijkstra  2015-09-25  Kevin Petit          * sysdeps/aarch64/memchr.S (__memchr): New file. --- sysdeps/aarch64/memchr.S | 157 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 sysdeps/aarch64/memchr.S diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S new file mode 100644 index 0000000..2f643dd --- /dev/null +++ b/sysdeps/aarch64/memchr.S @@ -0,0 +1,157 @@ +/* memchr - find a character in a memory zone + + Copyright (C) 2015 Free Software Foundation, Inc. 
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+/* Arguments and results.  */
+#define srcin		x0	/* In: address of the first byte to scan.  */
+#define chrin		w1	/* In: byte to look for (low 8 bits used).  */
+#define cntin		x2	/* In: byte count; counted down per block.  */
+
+#define result		x0	/* Out: address of the match, or 0.  */
+
+#define src		x3	/* 32-byte aligned block cursor.  */
+#define tmp		x4	/* Scratch (shift amounts, adjustments).  */
+#define wtmp2		w5	/* Holds the 0x40100401 lane constant.  */
+#define synd		x6	/* 64-bit syndrome, two bits per byte.  */
+#define soff		x9	/* srcin % 32.  */
+#define cntrem		x10	/* cntin % 32 (as passed in).  */
+
+#define vrepchr		v0	/* chrin replicated in all 16 byte lanes.  */
+#define vdata1		v1	/* Bytes 0-15 of the current block.  */
+#define vdata2		v2	/* Bytes 16-31 of the current block.  */
+#define vhas_chr1	v3	/* Per-byte compare result for vdata1.  */
+#define vhas_chr2	v4	/* Per-byte compare result for vdata2.  */
+#define vrepmask	v5	/* 0x40100401 in each 32-bit lane.  */
+#define vend		v6	/* Scratch for the pairwise reductions.  */
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+ * per byte. For each tuple, bit 0 is set if the relevant byte matched the
+ * requested character and bit 1 is not used (faster than using a 32bit
+ * syndrome). Since the bits in the syndrome reflect exactly the order in which
+ * things occur in the original string, counting trailing zeros allows us to
+ * identify exactly which byte has matched.
+ */
+
+ENTRY (__memchr)
+	/* Do not dereference srcin if no bytes to compare.  */
+	cbz	cntin, L(zero_length)
+	/*
+	 * Magic constant 0x40100401 allows us to identify which lane matches
+	 * the requested byte.
+	 */
+	mov	wtmp2, #0x0401			/* wtmp2 = 0x00000401 */
+	movk	wtmp2, #0x4010, lsl #16		/* wtmp2 = 0x40100401 */
+	dup	vrepchr.16b, chrin		/* chrin in every byte lane */
+	/* Work with aligned 32-byte chunks */
+	bic	src, srcin, #31			/* src = srcin rounded down to 32 */
+	dup	vrepmask.4s, wtmp2
+	ands	soff, srcin, #31		/* soff = srcin % 32; sets Z */
+	and	cntrem, cntin, #31		/* cntrem = cntin % 32 */
+	b.eq	L(loop)				/* soff == 0: already aligned */
+
+	/*
+	 * Input string is not 32-byte aligned. We calculate the syndrome
+	 * value for the aligned 32 bytes block containing the first bytes
+	 * and mask the irrelevant part.
+	 */
+
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	sub	tmp, soff, #32			/* tmp = -(32 - soff) */
+	adds	cntin, cntin, tmp		/* cntin -= 32 - soff; sets flags */
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
+	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
+	mov	synd, vend.d[0]
+	/* Clear the soff*2 lower bits */
+	lsl	tmp, soff, #1
+	lsr	synd, synd, tmp
+	lsl	synd, synd, tmp
+	/* The first block can also be the last */
+	b.ls	L(masklast)			/* flags from adds above */
+	/* Have we found something already? */
+	cbnz	synd, L(tail)
+
+L(loop):
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	subs	cntin, cntin, #32		/* flags consumed at b.ls/b.hi */
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	/* If we're out of data we finish regardless of the result */
+	b.ls	L(end)				/* cntin was <= 32 */
+	/* Use a fast check for the termination condition */
+	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
+	addp	vend.2d, vend.2d, vend.2d
+	mov	synd, vend.d[0]
+	/* We're not out of data, loop if we haven't found the character */
+	cbz	synd, L(loop)
+
+L(end):
+	/* Termination condition found, let's calculate the syndrome value */
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
+	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
+	mov	synd, vend.d[0]
+	/* Only do the clear for the last possible block */
+	b.hi	L(tail)				/* flags from subs: cntin was > 32 */
+
+L(masklast):
+	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
+	add	tmp, cntrem, soff
+	and	tmp, tmp, #31
+	sub	tmp, tmp, #32			/* tmp = tmp - 32 */
+	neg	tmp, tmp, lsl #1		/* tmp = -(tmp * 2) */
+	lsl	synd, synd, tmp
+	lsr	synd, synd, tmp
+
+L(tail):
+	/* Count the trailing zeros using bit reversing */
+	rbit	synd, synd
+	/* Compensate the last post-increment */
+	sub	src, src, #32
+	/* Check that we have found a character */
+	cmp	synd, #0
+	/* And count the leading zeros */
+	clz	synd, synd
+	/* Compute the potential result */
+	add	result, src, synd, lsr #1	/* two syndrome bits per byte */
+	/* Select result or NULL */
+	csel	result, xzr, result, eq
+	ret
+
+L(zero_length):
+	mov	result, #0
+	ret
+END (__memchr)
+weak_alias (__memchr, memchr)
+libc_hidden_builtin_def (memchr)