From patchwork Thu May 12 14:02:28 2016 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Wilco Dijkstra X-Patchwork-Id: 12219 Received: (qmail 108549 invoked by alias); 12 May 2016 14:02:46 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 108531 invoked by uid 89); 12 May 2016 14:02:46 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=0.1 required=5.0 tests=BAYES_00, LIKELY_SPAM_SUBJECT, SPF_PASS autolearn=no version=3.3.2 spammy=termination, H*c:HHH X-HELO: eu-smtp-delivery-143.mimecast.com From: Wilco Dijkstra To: 'GNU C Library' , Marcus Shawcroft CC: nd Subject: Fw: [PATCH][AArch64] Add optimized memchr Date: Thu, 12 May 2016 14:02:28 +0000 Message-ID: References: In-Reply-To: x-ms-office365-filtering-correlation-id: bb7144d0-7d3d-4d90-ba79-08d37a6e0dad x-microsoft-exchange-diagnostics: 1; AM3PR08MB0086; 5:5FJlIgnYzgLrbE7aHEk0Ilrig1bXb2BoFv66cT0EtdQJa4nQMhE8UtKOAk0HmPEKxX6znhMTjmriRWr1DO/09QJFmL/BfR/o4jarW5f00fHE5tAZ1r5KBOMX9nVCZwcLtRXZDvOGfRdMD6D2XIb+cQ==; 24:T3yVchXibhVgz+1vpKeH2gQIMYjfpTAGTyWKA6rWBfJfTuc9fnhTwxiHdd45hfFCtyEoFbg3SeBBoR9OAXx5EcdhTo+kImpu9R8IV2/3shg=; 7:WKEGYmXLmUqX9cTeT2HmqojjUvfKpriafGr9KZdrTCDPUQ+5d2nJ2xF267eTUdb6hfeSpG/tFAwPv3uhtaRYlduUG7EFq+2YpidYhgubD3rU3Pg4wYd4BiFMImC/lMfSfZLQaufOvCsvKGRt1bSH4+9dEZ/m47ZvTeE1P5o+1uYBg1WIaoa+7TNQo1fQmYNh; 20:LGh/hwyLCKLOzXuhPafZctnoF2XJpUBnj8HuHs1ie6oz5TC7qevdMSLAwgxJmEGiP61JvQ7KGA8LPHv0xTVWahDbPTLsPbxd9iAbKELjNA7tNj87SawvS2upRyJKmi/NuLQryvbh3ewRCIxm2q1Ry7OgQ/CFrsYl2r0u28fAEYY= x-microsoft-antispam: UriScan:;BCL:0;PCL:0;RULEID:;SRVR:AM3PR08MB0086; nodisclaimer: True x-microsoft-antispam-prvs: x-exchange-antispam-report-test: UriScan:; x-exchange-antispam-report-cfa-test: BCL:0; PCL:0; RULEID:(102415293)(102615271)(601004)(2401047)(5005006)(8121501046)(3002001)(10201501046)(6055026); SRVR:AM3PR08MB0086; BCL:0; PCL:0; RULEID:; SRVR:AM3PR08MB0086; x-forefront-prvs: 0940A19703 x-forefront-antispam-report: SFV:NSPM; SFS:(10009020)(6009001)(377424004)(13464003)(54534003)(189002)(199003)(106356001)(5008740100001)(450100001)(9686002)(68736007)(76576001)(5004730100002)(66066001)(1220700001)(189998001)(5002640100001)(19580405001)(19580395003)(87936001)(33656002)(5003600100002)(4326007)(81166006)(81156014)(54356999)(76176999)(2906002)(50986999)(99936001)(101416001)(2950100001)(2900100001)(586003)(3280700002)(3660700001)(92566002)(5250100002)(105586002)(74316001)(8936002)(102836003)(6116002)(97736004)(86362001)(3846002)(5001770100001); DIR:OUT; SFP:1101; SCL:1; SRVR:AM3PR08MB0086; H:AM3PR08MB0088.eurprd08.prod.outlook.com; FPR:; SPF:None; PTR:InfoNoRecords; MX:1; A:1; LANG:en; spamdiagnosticoutput: 1:23 spamdiagnosticmetadata: NSPM MIME-Version: 1.0 X-OriginatorOrg: arm.com X-MS-Exchange-CrossTenant-originalarrivaltime: 12 May 2016 14:02:28.2914 (UTC) X-MS-Exchange-CrossTenant-fromentityheader: Hosted X-MS-Exchange-CrossTenant-id: f34e5979-57d9-4aaa-ad4d-b122a662184d X-MS-Exchange-Transport-CrossTenantHeadersStamped: AM3PR08MB0086 X-MC-Unique: 0hMO0CTIT-WvDRMHXTNliQ-1 ping -----Original Message----- From: Wilco Dijkstra [mailto:wdijkstr@arm.com] Sent: 25 September 2015 14:21 To: 'GNU C Library' Subject: [PATCH][AArch64] Add optimized memchr An optimized memchr was missing for AArch64. This version is similar to strchr and is significantly faster than the C version. Passes GLIBC tests. OK for commit? ChangeLog: 2015-09-25 Wilco Dijkstra 2015-09-25 Kevin Petit * sysdeps/aarch64/memchr.S (__memchr): New file. --- sysdeps/aarch64/memchr.S | 157 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 sysdeps/aarch64/memchr.S diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S new file mode 100644 index 0000000..2f643dd --- /dev/null +++ b/sysdeps/aarch64/memchr.S @@ -0,0 +1,157 @@ +/* memchr - find a character in a memory zone + + Copyright (C) 2015 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#include + +/* Assumptions: + * + * ARMv8-a, AArch64 + * Neon Available. + */ + +/* Arguments and results. */ +#define srcin x0 +#define chrin w1 +#define cntin x2 + +#define result x0 + +#define src x3 +#define tmp x4 +#define wtmp2 w5 +#define synd x6 +#define soff x9 +#define cntrem x10 + +#define vrepchr v0 +#define vdata1 v1 +#define vdata2 v2 +#define vhas_chr1 v3 +#define vhas_chr2 v4 +#define vrepmask v5 +#define vend v6 + +/* + * Core algorithm: + * + * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits + * per byte. For each tuple, bit 0 is set if the relevant byte matched the + * requested character and bit 1 is not used (faster than using a 32bit + * syndrome). Since the bits in the syndrome reflect exactly the order in which + * things occur in the original string, counting trailing zeros allows to + * identify exactly which byte has matched. + */ + +ENTRY (__memchr) + /* Do not dereference srcin if no bytes to compare. */ + cbz cntin, L(zero_length) + /* + * Magic constant 0x40100401 allows us to identify which lane matches + * the requested byte. + */ + mov wtmp2, #0x0401 + movk wtmp2, #0x4010, lsl #16 + dup vrepchr.16b, chrin + /* Work with aligned 32-byte chunks */ + bic src, srcin, #31 + dup vrepmask.4s, wtmp2 + ands soff, srcin, #31 + and cntrem, cntin, #31 + b.eq L(loop) + + /* + * Input string is not 32-byte aligned. We calculate the syndrome + * value for the aligned 32 bytes block containing the first bytes + * and mask the irrelevant part. + */ + + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + sub tmp, soff, #32 + adds cntin, cntin, tmp + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b + addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ + addp vend.16b, vend.16b, vend.16b /* 128->64 */ + mov synd, vend.2d[0] + /* Clear the soff*2 lower bits */ + lsl tmp, soff, #1 + lsr synd, synd, tmp + lsl synd, synd, tmp + /* The first block can also be the last */ + b.ls L(masklast) + /* Have we found something already? */ + cbnz synd, L(tail) + +L(loop): + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + subs cntin, cntin, #32 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + /* If we're out of data we finish regardless of the result */ + b.ls L(end) + /* Use a fast check for the termination condition */ + orr vend.16b, vhas_chr1.16b, vhas_chr2.16b + addp vend.2d, vend.2d, vend.2d + mov synd, vend.2d[0] + /* We're not out of data, loop if we haven't found the character */ + cbz synd, L(loop) + +L(end): + /* Termination condition found, let's calculate the syndrome value */ + and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b + addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */ + addp vend.16b, vend.16b, vend.16b /* 128->64 */ + mov synd, vend.2d[0] + /* Only do the clear for the last possible block */ + b.hi L(tail) + +L(masklast): + /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ + add tmp, cntrem, soff + and tmp, tmp, #31 + sub tmp, tmp, #32 + neg tmp, tmp, lsl #1 + lsl synd, synd, tmp + lsr synd, synd, tmp + +L(tail): + /* Count the trailing zeros using bit reversing */ + rbit synd, synd + /* Compensate the last post-increment */ + sub src, src, #32 + /* Check that we have found a character */ + cmp synd, #0 + /* And count the leading zeros */ + clz synd, synd + /* Compute the potential result */ + add result, src, synd, lsr #1 + /* Select result or NULL */ + csel result, xzr, result, eq + ret + +L(zero_length): + mov result, #0 + ret +END (__memchr) +weak_alias (__memchr, memchr) +libc_hidden_builtin_def (memchr)