From patchwork Wed Jun 3 09:53:11 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Andrea Corallo X-Patchwork-Id: 39433 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 5C2093851C27; Wed, 3 Jun 2020 09:53:28 +0000 (GMT) X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from EUR01-VE1-obe.outbound.protection.outlook.com (mail-eopbgr140089.outbound.protection.outlook.com [40.107.14.89]) by sourceware.org (Postfix) with ESMTPS id 319C9385DC00 for ; Wed, 3 Jun 2020 09:53:24 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.3.2 sourceware.org 319C9385DC00 Authentication-Results: sourceware.org; dmarc=none (p=none dis=none) header.from=arm.com Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=Andrea.Corallo@arm.com DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=armh.onmicrosoft.com; s=selector2-armh-onmicrosoft-com; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=X2GnJjJiCJOe8kQ70KTvpaDqNtq/4LhsKweyhD9hao0=; b=FjiR8PNw74ii7sL/5K9wwo6UQFsqoYY+/GvwsjGW8tBECvkkXNCVDrGSTDlbQvsb69rHdXW0gyMhuQuZEghvGgtCV3Ai4jiO2pIMZELhwLjLvB+67QTTsa8qw5b2QYBeith0aslIuclXanMu4kzIqyI6lxZ1mEoYeexFnI0MA04= Received: from DB8P191CA0027.EURP191.PROD.OUTLOOK.COM (2603:10a6:10:130::37) by VE1PR08MB4766.eurprd08.prod.outlook.com (2603:10a6:802:a9::18) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.3045.22; Wed, 3 Jun 2020 09:53:22 +0000 Received: from DB5EUR03FT061.eop-EUR03.prod.protection.outlook.com (2603:10a6:10:130:cafe::3b) by DB8P191CA0027.outlook.office365.com (2603:10a6:10:130::37) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.3066.18 via Frontend Transport; Wed, 3 Jun 2020 
09:53:22 +0000 X-MS-Exchange-Authentication-Results: spf=pass (sender IP is 63.35.35.123) smtp.mailfrom=arm.com; sourceware.org; dkim=pass (signature was verified) header.d=armh.onmicrosoft.com; sourceware.org; dmarc=bestguesspass action=none header.from=arm.com; Received-SPF: Pass (protection.outlook.com: domain of arm.com designates 63.35.35.123 as permitted sender) receiver=protection.outlook.com; client-ip=63.35.35.123; helo=64aa7808-outbound-1.mta.getcheckrecipient.com; Received: from 64aa7808-outbound-1.mta.getcheckrecipient.com (63.35.35.123) by DB5EUR03FT061.mail.protection.outlook.com (10.152.21.234) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.3066.18 via Frontend Transport; Wed, 3 Jun 2020 09:53:21 +0000 Received: ("Tessian outbound 952576a3272a:v57"); Wed, 03 Jun 2020 09:53:21 +0000 X-CheckRecipientChecked: true X-CR-MTA-CID: d2d525db0f209a7d X-CR-MTA-TID: 64aa7808 Received: from e76423147844.1 by 64aa7808-outbound-1.mta.getcheckrecipient.com id B369CA2D-BA3C-4422-9D59-EE94F79489B6.1; Wed, 03 Jun 2020 09:53:16 +0000 Received: from EUR02-AM5-obe.outbound.protection.outlook.com by 64aa7808-outbound-1.mta.getcheckrecipient.com with ESMTPS id e76423147844.1 (version=TLSv1.2 cipher=ECDHE-RSA-AES256-GCM-SHA384); Wed, 03 Jun 2020 09:53:16 +0000 ARC-Seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; b=oaHhC0abzhkYaBDTaLJv5LwWOPxdgtJ3IGt4rElDlFd5tpb/XRIOoFBCGjICoyEI1vFIJKO4emlCdhd/Zo2ANv0lav7jTmir94guq62NRY2QFNi7+cidooBkp8kUAZl6yKIB8YiJfKlqCjKmn/9iiwcQT2/HpLS7QszS726xTDcVec1eidkHL1zAtLIenpsxMmrcRuOHl8JXRNr9meaWj98drNW7E6ucUP3nDLHTGwHvTd8rku1lx4t4+vZT8vOLhSYNqCxvmTHVV5Qr5QsudHvtZw1RNHv07pJoPeQDxITrlS0DjaviKEvC486zL6fVkE6UfWBoc8REgRmN5UZ4gg== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=X2GnJjJiCJOe8kQ70KTvpaDqNtq/4LhsKweyhD9hao0=; 
b=OkcXcXVLPe5ObiyJ6xgvz1O6wOS7urCHj7XbcveCxmQ0sZJTFFs3VXsgpfdvBzDkK5gr2MSHvVZ1iWy/BoRNlciVZ/HKqYOWUYI8KWE/Qe5NDlorae62OXKsruMxCex73cZLZ0y34jVZCkS16rxyLG/+tDpas2tE3HSvFUYx9XXTLVeXjw7RMo0lm7OVc6a41YhtDRj48XBufmp8eulAQE9PFRKN+arx7FSE0KvJD3lZ9E+uAX9VYTgdT1mD4dcKoahyS3wsuRwXUadOK6tCiSFdBtuzb3XqkdR8//SIpGT5W2JbPzL4tT+nHdpxg9gJJ+SNqF1Udn6+hgyNvbczXg== ARC-Authentication-Results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=arm.com; dmarc=pass action=none header.from=arm.com; dkim=pass header.d=arm.com; arc=none DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=armh.onmicrosoft.com; s=selector2-armh-onmicrosoft-com; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=X2GnJjJiCJOe8kQ70KTvpaDqNtq/4LhsKweyhD9hao0=; b=FjiR8PNw74ii7sL/5K9wwo6UQFsqoYY+/GvwsjGW8tBECvkkXNCVDrGSTDlbQvsb69rHdXW0gyMhuQuZEghvGgtCV3Ai4jiO2pIMZELhwLjLvB+67QTTsa8qw5b2QYBeith0aslIuclXanMu4kzIqyI6lxZ1mEoYeexFnI0MA04= Authentication-Results-Original: arm.com; dkim=none (message not signed) header.d=none;arm.com; dmarc=none action=none header.from=arm.com; Received: from DB7PR08MB3594.eurprd08.prod.outlook.com (2603:10a6:10:4e::11) by DB7PR08MB3338.eurprd08.prod.outlook.com (2603:10a6:5:1b::22) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.3066.18; Wed, 3 Jun 2020 09:53:15 +0000 Received: from DB7PR08MB3594.eurprd08.prod.outlook.com ([fe80::5447:f1c0:97c:aa35]) by DB7PR08MB3594.eurprd08.prod.outlook.com ([fe80::5447:f1c0:97c:aa35%7]) with mapi id 15.20.3045.024; Wed, 3 Jun 2020 09:53:15 +0000 From: Andrea Corallo To: libc-alpha@sourceware.org Subject: [PATCH] aarch64: MTE compatible strlen Date: Wed, 03 Jun 2020 11:53:11 +0200 Message-ID: X-ClientProxiedBy: LO2P265CA0055.GBRP265.PROD.OUTLOOK.COM (2603:10a6:600:60::19) To DB7PR08MB3594.eurprd08.prod.outlook.com (2603:10a6:10:4e::11) MIME-Version: 1.0 X-MS-Exchange-MessageSentRepresentingType: 1 Received: from e112547 (217.140.96.140) by 
LO2P265CA0055.GBRP265.PROD.OUTLOOK.COM (2603:10a6:600:60::19) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.3066.18 via Frontend Transport; Wed, 3 Jun 2020 09:53:13 +0000 X-Originating-IP: [217.140.96.140] X-MS-PublicTrafficType: Email X-MS-Office365-Filtering-HT: Tenant X-MS-Office365-Filtering-Correlation-Id: 11bde3aa-3e46-4671-24ed-08d807a3f387 X-MS-TrafficTypeDiagnostic: DB7PR08MB3338:|VE1PR08MB4766: X-MS-Exchange-Transport-Forked: True X-Microsoft-Antispam-PRVS: x-checkrecipientrouted: true NoDisclaimer: true X-MS-Oob-TLC-OOBClassifiers: OLM:10000;OLM:10000; X-Forefront-PRVS: 04238CD941 X-MS-Exchange-SenderADCheck: 1 X-Microsoft-Antispam-Untrusted: BCL:0; X-Microsoft-Antispam-Message-Info-Original: moIA8LqfG3n1vSJE7e6o50E2p92w3PxsUiG/N+KO9wFjO2mpzXHO4szlv37DUDYC1u3OHUB7vl9e81l462FkLObzxwdXZ8te3HCAggRL9Co7vDnXvCxyWBhS0GE64bESSFyq3T5a5L88svD0p/pRgYLWVJ78SCSCmiuQcnbQxsyX17Ep8DDq3skIsnu9l4vt4DpRi3tWOPPsbo3n+W3fz9xn5sb7MJcC+RzrluAT3P3p9c5FO1oJuezxFjgKQz0+9I8rQqXAnGbrucHvPztHomBk5BKn6krE2cS3G06u7wlmX0NWv5+XuPnCLq5HyGoq X-Forefront-Antispam-Report-Untrusted: CIP:255.255.255.255; CTRY:; LANG:en; SCL:1; SRV:; IPV:NLI; SFV:NSPM; H:DB7PR08MB3594.eurprd08.prod.outlook.com; PTR:; CAT:NONE; SFTY:; SFS:(4636009)(366004)(376002)(346002)(396003)(136003)(39860400002)(66946007)(8936002)(5660300002)(235185007)(8676002)(6486002)(478600001)(36756003)(2906002)(4326008)(66616009)(66476007)(66556008)(6916009)(16526019)(44832011)(316002)(86362001)(6496006)(52116002)(956004)(2616005)(186003)(26005); DIR:OUT; SFP:1101; X-MS-Exchange-AntiSpam-MessageData: 
Z+RRFIyQ25eg93Pk2I5AUBJxk7joPqH+F/ycnqQa62Wr0nomiMzBisx5K8LhRSu1k1L5CK9au573/a3ia/Vd6SBlihdt4budWoJCqMzhnoV9DzbA8fLcgL+2SCQukprIADLyE7oc1l6jrsfh1vOHk5cp+VafiH/4xn0HBz3bhVhEWtpbxXWFNXFXOT39MIIr15VbB3qkSxg6jLOBG5b9cpAFec9pZPFMC17gRnwtROjzG5kkR9VESGwdpSmzkCKFqbEnoxUMaUkswrjIcbARKv8J7y/4DfWbhtUIVXf7uMV+zkui/zWA1N9rGG284rq6e7dmHGVF8jlOv4hxz1l3SGqHgTP6HdcYcrOkAgWV9o3RnTdq7zWLYq67znJ4H4/wiZCCBRD364QZhqAvM66kouunTk/wKEZQbgXczJRvfhkZwtoX9OgQTA44PlY6e3j/OlFePKDcU/uIZypoCioqmRcSSl8t5hzR5Yv7wvVZBPzAMz+d/QNT0CprPucDPplC X-MS-Exchange-Transport-CrossTenantHeadersStamped: DB7PR08MB3338 Original-Authentication-Results: arm.com; dkim=none (message not signed) header.d=none;arm.com; dmarc=none action=none header.from=arm.com; X-EOPAttributedMessage: 0 X-MS-Exchange-Transport-CrossTenantHeadersStripped: DB5EUR03FT061.eop-EUR03.prod.protection.outlook.com X-Forefront-Antispam-Report: CIP:63.35.35.123; CTRY:IE; LANG:en; SCL:1; SRV:; IPV:CAL; SFV:NSPM; H:64aa7808-outbound-1.mta.getcheckrecipient.com; PTR:ec2-63-35-35-123.eu-west-1.compute.amazonaws.com; CAT:NONE; SFTY:; SFS:(4636009)(136003)(396003)(39860400002)(346002)(376002)(46966005)(6916009)(8676002)(8936002)(235185007)(81166007)(356005)(82740400003)(47076004)(26005)(2906002)(82310400002)(6486002)(186003)(956004)(4326008)(16526019)(44832011)(5660300002)(70206006)(336012)(86362001)(36756003)(70586007)(66616009)(2616005)(316002)(6496006)(478600001); DIR:OUT; SFP:1101; X-MS-Office365-Filtering-Correlation-Id-Prvs: 62cee417-71e8-42f8-2732-08d807a3ef44 X-Forefront-PRVS: 04238CD941 X-Microsoft-Antispam: BCL:0; X-Microsoft-Antispam-Message-Info: 
i8u1hvP3/nxuypijFaVWbftrD0COf3CGQKHrVzJF2dPC4FNJ2Mw3Mvjn/KbgYax55s/nwaQMjEdoo9JWKznN5+MmOI5K3MetklptXRgvoqTq+OSrXbeY445RV1Fdb7AQQv24d2ib+cQ1Mmp+JC0h8kO37K9HCmhn5iLBzv1smZOBTmTQkXwSm2ZF4ufCSvVJ4J/3VGXepl2sE/HQ8Ib1FYFudNJFWww8GixZC0gIQWa1mMwexbuRoH6PWc8peXG9ZrBlImdaOaCEWonb9NR0ukcnhD3AjKtszcUYxd6xto9hwLiVZR4V17YNMAeAvRwyyNlcd/IGh71REgwu3QlXeRNSTVYxXlx1VHlMA4y+VNFuei5DR8w9HkdmdozK/c3+U9ctYeeAmy2PTfF82eYhog== X-OriginatorOrg: arm.com X-MS-Exchange-CrossTenant-OriginalArrivalTime: 03 Jun 2020 09:53:21.9629 (UTC) X-MS-Exchange-CrossTenant-Network-Message-Id: 11bde3aa-3e46-4671-24ed-08d807a3f387 X-MS-Exchange-CrossTenant-Id: f34e5979-57d9-4aaa-ad4d-b122a662184d X-MS-Exchange-CrossTenant-OriginalAttributedTenantConnectingIp: TenantId=f34e5979-57d9-4aaa-ad4d-b122a662184d; Ip=[63.35.35.123]; Helo=[64aa7808-outbound-1.mta.getcheckrecipient.com] X-MS-Exchange-CrossTenant-FromEntityHeader: HybridOnPrem X-MS-Exchange-Transport-CrossTenantHeadersStamped: VE1PR08MB4766 X-Spam-Status: No, score=-18.7 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, GIT_PATCH_0, MSGID_FROM_MTA_HEADER, RCVD_IN_DNSWL_LOW, RCVD_IN_MSPIKE_H2, SPF_HELO_PASS, SPF_PASS, TXREP, UNPARSEABLE_RELAY autolearn=ham autolearn_force=no version=3.4.2 X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Cc: nd@arm.com, Wilco.Dijkstra@arm.com Errors-To: libc-alpha-bounces@sourceware.org Sender: "Libc-alpha" Hi all, I'd like to submit this patch introducing an Arm MTE compatible strlen implementation. Follows a performance comparison of the strlen benchmark run on Cortex-A72, Cortex-A53, Neoverse N1. 
| length | alignment | perf-uplift A72 | perf-uplift A53 | perf-uplift N1 | |--------+-----------+-----------------+-----------------|------------| | 1 | 1 | 1.00x | 0.96x | 1.13x | | 1 | 0 | 2.15x | 0.96x | 1.00x | | 2 | 2 | 1.16x | 0.95x | 1.09x | | 2 | 0 | 1.17x | 0.93x | 1.00x | | 3 | 3 | 1.30x | 0.95x | 1.09x | | 3 | 0 | 1.32x | 0.96x | 1.00x | | 4 | 4 | 1.14x | 0.87x | 0.99x | | 4 | 0 | 1.14x | 0.96x | 1.00x | | 5 | 5 | 1.15x | 0.89x | 1.09x | | 5 | 0 | 1.19x | 0.96x | 1.00x | | 6 | 6 | 1.14x | 0.96x | 1.39x | | 6 | 0 | 1.14x | 0.95x | 1.00x | | 7 | 7 | 1.03x | 0.90x | 1.09x | | 7 | 0 | 1.14x | 0.95x | 1.27x | | 4 | 0 | 1.15x | 0.87x | 1.00x | | 4 | 7 | 1.15x | 0.96x | 1.10x | | 4 | 2 | 1.27x | 0.95x | 1.39x | | 2 | 2 | 1.14x | 0.96x | 1.09x | | 8 | 0 | 1.15x | 0.96x | 1.00x | | 8 | 7 | 1.14x | 0.96x | 1.09x | | 8 | 3 | 1.17x | 0.96x | 1.39x | | 5 | 3 | 1.14x | 0.96x | 1.39x | | 16 | 0 | 1.15x | 0.83x | 1.48x | | 16 | 7 | 1.14x | 0.80x | 1.43x | | 16 | 4 | 1.15x | 0.83x | 1.48x | | 10 | 4 | 1.15x | 0.96x | 1.27x | | 32 | 0 | 1.04x | 0.88x | 1.16x | | 32 | 7 | 1.02x | 0.84x | 1.19x | | 32 | 5 | 1.04x | 0.84x | 1.23x | | 21 | 5 | 1.14x | 0.83x | 1.60x | | 64 | 0 | 1.17x | 0.80x | 1.75x | | 64 | 7 | 1.17x | 0.77x | 1.83x | | 64 | 6 | 1.17x | 0.77x | 1.57x | | 42 | 6 | 1.00x | 0.80x | 1.42x | | 128 | 0 | 0.96x | 0.68x | 1.80x | | 128 | 7 | 0.96x | 0.66x | 1.85x | | 128 | 7 | 0.96x | 0.67x | 1.86x | | 85 | 7 | 1.05x | 0.75x | 1.87x | | 256 | 0 | 0.98x | 0.69x | 1.88x | | 256 | 7 | 0.98x | 0.68x | 1.92x | | 256 | 8 | 0.99x | 0.69x | 1.88x | | 170 | 8 | 0.96x | 0.72x | 1.86x | | 512 | 0 | 0.99x | 0.65x | 1.90x | | 512 | 7 | 0.98x | 0.65x | 1.92x | | 512 | 9 | 0.99x | 0.65x | 1.92x | | 341 | 9 | 0.98x | 0.68x | 1.99x | | 1024 | 0 | 0.99x | 0.63x | 1.90x | | 1024 | 7 | 0.99x | 0.62x | 1.92x | | 1024 | 10 | 0.99x | 0.62x | 1.92x | | 682 | 10 | 0.99x | 0.64x | 1.96x | | 2048 | 0 | 0.99x | 0.61x | 1.92x | | 2048 | 7 | 1.01x | 0.61x | 1.93x | | 2048 | 11 | 1.00x | 0.61x | 
1.95x | | 1365 | 11 | 1.00x | 0.62x | 1.94x | | 4096 | 0 | 1.00x | 0.61x | 1.93x | | 4096 | 7 | 1.00x | 0.61x | 1.94x | | 4096 | 12 | 1.00x | 0.61x | 1.95x | | 2730 | 12 | 1.00x | 0.61x | 1.94x | This patch is passing GLIBC tests. Regards Andrea 8< --- 8< --- 8< Introduce an Arm MTE compatible strlen implementation. Benchmarked on Cortex-A72, Cortex-A53, Neoverse N1 does not show performance regressions. Co-authored-by: Wilco Dijkstra diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S index e01fab7c2a..e314fffed6 100644 --- a/sysdeps/aarch64/strlen.S +++ b/sysdeps/aarch64/strlen.S @@ -20,205 +20,78 @@ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses, min page size 4k. + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. */ #ifndef STRLEN # define STRLEN __strlen #endif -/* To test the page crossing code path more thoroughly, compile with - -DTEST_PAGE_CROSS - this will force all calls through the slower - entry path. This option is not intended for production use. */ - -/* Arguments and results. */ #define srcin x0 -#define len x0 +#define result x0 -/* Locals and temporaries. */ #define src x1 -#define data1 x2 -#define data2 x3 -#define has_nul1 x4 -#define has_nul2 x5 -#define tmp1 x4 -#define tmp2 x5 -#define tmp3 x6 -#define tmp4 x7 -#define zeroones x8 - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. A faster check - (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives - false hits for characters 129..255. */ - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 - -#ifdef TEST_PAGE_CROSS -# define MIN_PAGE_SIZE 16 -#else -# define MIN_PAGE_SIZE 4096 -#endif - - /* Since strings are short on average, we check the first 16 bytes - of the string for a NUL character. 
In order to do an unaligned ldp - safely we have to do a page cross check first. If there is a NUL - byte we calculate the length from the 2 8-byte words using - conditional select to reduce branch mispredictions (it is unlikely - strlen will be repeatedly called on strings with the same length). - - If the string is longer than 16 bytes, we align src so don't need - further page cross checks, and process 32 bytes per iteration - using the fast NUL check. If we encounter non-ASCII characters, - fallback to a second loop using the full NUL check. - - If the page cross check fails, we read 16 bytes from an aligned - address, remove any characters before the string, and continue - in the main loop using aligned loads. Since strings crossing a - page in the first 16 bytes are rare (probability of - 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. - - AArch64 systems have a minimum page size of 4k. We don't bother - checking for larger page sizes - the cost of setting up the correct - page size is just not worth the extra gain from a small reduction in - the cases taking the slow path. Note that we only care about - whether the first fetch, which may be misaligned, crosses a page - boundary. */ - -ENTRY_ALIGN (STRLEN, 6) +#define synd x2 +#define tmp x3 +#define wtmp w3 +#define shift x4 + +#define data q0 +#define vdata v0 +#define vhas_nul v1 +#define vrepmask v2 +#define vend v3 +#define dend d3 + +/* Core algorithm: + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte is NUL. + Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. 
*/ + +ENTRY (STRLEN) DELOUSE (0) DELOUSE (1) - and tmp1, srcin, MIN_PAGE_SIZE - 1 - mov zeroones, REP8_01 - cmp tmp1, MIN_PAGE_SIZE - 16 - b.gt L(page_cross) - ldp data1, data2, [srcin] -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul1/2 directly. - Since we expect strings to be small and early-exit, - byte-swap the data now so has_null1/2 will be correct. */ - rev data1, data1 - rev data2, data2 -#endif - sub tmp1, data1, zeroones - orr tmp2, data1, REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, REP8_7f - bics has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - ccmp has_nul2, 0, 0, eq - beq L(main_loop_entry) - - /* Enter with C = has_nul1 == 0. */ - csel has_nul1, has_nul1, has_nul2, cc - mov len, 8 - rev has_nul1, has_nul1 - clz tmp1, has_nul1 - csel len, xzr, len, cc - add len, len, tmp1, lsr 3 + bic src, srcin, 15 + mov wtmp, 0xf00f + ld1 {vdata.16b}, [src] + dup vrepmask.8h, wtmp + cmeq vhas_nul.16b, vdata.16b, 0 + lsl shift, srcin, 2 + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + fmov synd, dend + lsr synd, synd, shift + cbz synd, L(loop) + + rbit synd, synd + clz result, synd + lsr result, result, 2 ret - /* The inner loop processes 32 bytes per iteration and uses the fast - NUL check. If we encounter non-ASCII characters, use a second - loop with the accurate NUL check. */ - .p2align 4 -L(main_loop_entry): - bic src, srcin, 15 - sub src, src, 16 -L(main_loop): - ldp data1, data2, [src, 32]! -L(page_cross_entry): - sub tmp1, data1, zeroones - sub tmp3, data2, zeroones - orr tmp2, tmp1, tmp3 - tst tmp2, zeroones, lsl 7 - bne 1f - ldp data1, data2, [src, 16] - sub tmp1, data1, zeroones - sub tmp3, data2, zeroones - orr tmp2, tmp1, tmp3 - tst tmp2, zeroones, lsl 7 - beq L(main_loop) - add src, src, 16 -1: - /* The fast check failed, so do the slower, accurate NUL check. 
*/ - orr tmp2, data1, REP8_7f - orr tmp4, data2, REP8_7f - bics has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - ccmp has_nul2, 0, 0, eq - beq L(nonascii_loop) - - /* Enter with C = has_nul1 == 0. */ -L(tail): -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul1/2 directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. */ - csel data1, data1, data2, cc - rev data1, data1 - sub tmp1, data1, zeroones - orr tmp2, data1, REP8_7f - bic has_nul1, tmp1, tmp2 -#else - csel has_nul1, has_nul1, has_nul2, cc + .p2align 5 +L(loop): + ldr data, [src, 16]! + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(loop) + + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + sub result, src, srcin + fmov synd, dend +#ifndef __AARCH64EB__ + rbit synd, synd #endif - sub len, src, srcin - rev has_nul1, has_nul1 - add tmp2, len, 8 - clz tmp1, has_nul1 - csel len, len, tmp2, cc - add len, len, tmp1, lsr 3 + clz tmp, synd + add result, result, tmp, lsr 2 ret -L(nonascii_loop): - ldp data1, data2, [src, 16]! - sub tmp1, data1, zeroones - orr tmp2, data1, REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, REP8_7f - bics has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - ccmp has_nul2, 0, 0, eq - bne L(tail) - ldp data1, data2, [src, 16]! - sub tmp1, data1, zeroones - orr tmp2, data1, REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, REP8_7f - bics has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - ccmp has_nul2, 0, 0, eq - beq L(nonascii_loop) - b L(tail) - - /* Load 16 bytes from [srcin & ~15] and force the bytes that precede - srcin to 0x7f, so we ignore any NUL bytes before the string. - Then continue in the aligned loop. 
*/ -L(page_cross): - bic src, srcin, 15 - ldp data1, data2, [src] - lsl tmp1, srcin, 3 - mov tmp4, -1 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ -#endif - orr tmp1, tmp1, REP8_80 - orn data1, data1, tmp1 - orn tmp2, data2, tmp1 - tst srcin, 8 - csel data1, data1, tmp4, eq - csel data2, data2, tmp2, eq - b L(page_cross_entry) END (STRLEN) weak_alias (STRLEN, strlen) libc_hidden_builtin_def (strlen)