From patchwork Tue Oct 22 16:33:52 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Wilco Dijkstra X-Patchwork-Id: 35224 Received: (qmail 74990 invoked by alias); 22 Oct 2019 16:34:11 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 74978 invoked by uid 89); 22 Oct 2019 16:34:10 -0000 Authentication-Results: sourceware.org; auth=none X-Spam-SWARE-Status: No, score=-19.5 required=5.0 tests=AWL, BAYES_00, FORGED_SPF_HELO, GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, KAM_MANYTO, RCVD_IN_DNSWL_NONE, SPF_HELO_PASS autolearn=ham version=3.3.1 spammy= X-HELO: EUR03-AM5-obe.outbound.protection.outlook.com DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=armh.onmicrosoft.com; s=selector2-armh-onmicrosoft-com; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=N/IdrQDDz4SO0ao/f6N1TXnyy4H3bnRfNZ7bWmn0hi8=; b=SnpflVS61sT3HpZqHWncsIQptKag2CKgophItdMecY3/nUo7vhAgZppsHjoEHOh7S5sHAPvUeM1MioQpN34/OycuMO8bN+Cv8zI23iYQzNcMGOj2F+vBoq5FLlsrUP3DebVySEdEbUvBRmOjE45DPjGpH8L3rVyw5HcmNJYF8bc= Authentication-Results: spf=temperror (sender IP is 63.35.35.123) smtp.mailfrom=arm.com; sourceware.org; dkim=pass (signature was verified) header.d=armh.onmicrosoft.com; sourceware.org; dmarc=none action=none header.from=arm.com; Received-SPF: TempError (protection.outlook.com: error in processing during lookup of arm.com: DNS Timeout) X-CheckRecipientChecked: true X-CR-MTA-CID: ed5d1da112a149a4 X-CR-MTA-TID: 64aa7808 ARC-Seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; 
b=ek51X6b31L1Hnd/Ngcf++fy95bCh35kCgHR0NzOMHN1W0h5E0+q9qLxQqd+I6h7giF6Lz7DECeR+LUcQU+iP8JdS89caa/4AaBHjZjTY3uWKWOQwcQWj7rCd08LR9YS/s/mTcnjuy1zdjWOg0QVnyvRenU3ECW5SXZxKgcj6ZkoWq1zMGT4HadjZ7zRKMQ6Ftwd6BZ08E4A5ayLdXlrCbzrd27slFTz0RSpb80drW7xwSiADGBSztdGbTf0KRPiYByDVtsCGOZg1HSA5kUEPhvlVnnM9dLTZd5t/St5iwVaU6x4hTDUfRUjR9klY+39rqs7ftzH5HyKwCCimqix4VQ== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=N/IdrQDDz4SO0ao/f6N1TXnyy4H3bnRfNZ7bWmn0hi8=; b=JUQiujYHAMFewaccqKGhKI22qENBSTc5/Yo2Tr/JnZzRliLO7Wj9rCROKCHkSFa0ebhnKXAR2rKSJ9+F/SpVTo8dEpCVOf+z0aW+s9pJthmb1rMntQci5y8Q1cklo3GKeoOum3IdLGfb8DTw2RdaGyzaMCfjCseZOtk4Qe8LDjuTWci7cqR+ybUlVx0RkkFV+PbxMKQxYZQdA76oTgdLXG19k27CHUfXHmszYPW85VvSFFkMqGHZkD9O1p0cbbhqWBMaMSJMLGFY/500NGiKVmTLrJkV3RrSvhwlXeTsPSHWmw7s94SOeW0opw4O40eN7sHVp4sL2eWZhGKG0fWnGQ== ARC-Authentication-Results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=arm.com; dmarc=pass action=none header.from=arm.com; dkim=pass header.d=arm.com; arc=none DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=armh.onmicrosoft.com; s=selector2-armh-onmicrosoft-com; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=N/IdrQDDz4SO0ao/f6N1TXnyy4H3bnRfNZ7bWmn0hi8=; b=SnpflVS61sT3HpZqHWncsIQptKag2CKgophItdMecY3/nUo7vhAgZppsHjoEHOh7S5sHAPvUeM1MioQpN34/OycuMO8bN+Cv8zI23iYQzNcMGOj2F+vBoq5FLlsrUP3DebVySEdEbUvBRmOjE45DPjGpH8L3rVyw5HcmNJYF8bc= From: Wilco Dijkstra To: Xuelei Zhang , "libc-alpha@sourceware.org" , "siddhesh@gotplt.org" , Szabolcs Nagy , "jiangyikun@huawei.com" , "yikunkero@gmail.com" CC: nd Subject: Re: [PATCH v2] aarch64: Optimized strlen for strlen_asimd Date: Tue, 22 Oct 2019 16:33:52 +0000 Message-ID: References: <20191022094118.11468-1-zhangxuelei4@huawei.com> In-Reply-To: <20191022094118.11468-1-zhangxuelei4@huawei.com> Authentication-Results-Original: spf=none (sender IP is ) 
smtp.mailfrom=Wilco.Dijkstra@arm.com; x-ms-exchange-transport-forked: True x-checkrecipientrouted: true x-ms-oob-tlc-oobclassifiers: OLM:506;OLM:506; X-Forefront-Antispam-Report-Untrusted: SFV:NSPM; SFS:(10009020)(4636009)(346002)(136003)(396003)(39860400002)(366004)(376002)(189003)(199004)(102836004)(2501003)(7736002)(81156014)(71190400001)(186003)(7696005)(8676002)(81166006)(76176011)(6506007)(99286004)(71200400001)(26005)(86362001)(76116006)(66066001)(476003)(486006)(11346002)(2201001)(110136005)(229853002)(74316002)(305945005)(33656002)(66946007)(66446008)(64756008)(446003)(478600001)(14454004)(3846002)(9686003)(6246003)(8936002)(6116002)(55016002)(5660300002)(256004)(2906002)(25786009)(66556008)(6436002)(4326008)(316002)(66476007)(52536014); DIR:OUT; SFP:1101; SCL:1; SRVR:VI1PR0801MB1757; H:VI1PR0801MB2127.eurprd08.prod.outlook.com; FPR:; SPF:None; LANG:en; PTR:InfoNoRecords; MX:1; A:1; received-spf: None (protection.outlook.com: arm.com does not designate permitted sender hosts) X-MS-Exchange-SenderADCheck: 1 X-Microsoft-Antispam-Untrusted: BCL:0; X-Microsoft-Antispam-Message-Info-Original: HINMsSZIy6gSc1E4CZi3XAZSHRIh3Oy/nBHnU7W1nskPQkGu4HKL7qxzm3XrOwKt486EbwDvoV+SeOefZUc/hhgh1fAlBEEHJAEsnB10wdQWwQsVF/uFcgPJhC4hPRabLhdToQlzntOSsy8US2HKZywHe9pGI2tXzTgbbwVAUT9KEg+eoaS+eiCZE9dqDFctswPmRrXIvud1YT8ENoayf+gEBREhwNhCGu8v5ii+tlabG6xaFvlcM9FbsP4IGpIrrO7FDJ22bjXGWY7boIpA2ZbFAGg587RJmcYX9u7f4+cChnrkUfrDMPrRb5Xss+kvJKowqCPY+ypQDdwLMPLtV8AsrheFkJm/xhyumBNgiG61pN/S8K/1HA6UwG1YMtcn7SKIqq6I3pz7zdLvLpNlTtYY/s8PdojSxExw9fIeJOXoadLih0QYPbgBQy8NeLZK MIME-Version: 1.0 Original-Authentication-Results: spf=none (sender IP is ) smtp.mailfrom=Wilco.Dijkstra@arm.com; Return-Path: Wilco.Dijkstra@arm.com X-MS-Exchange-Transport-CrossTenantHeadersStripped: VE1EUR03FT022.eop-EUR03.prod.protection.outlook.com X-MS-Office365-Filtering-Correlation-Id-Prvs: 67343220-b521-4409-fe54-08d7570d9fbd Hi Xuelei, > Optimize the strlen implementation by using vector operations and > loop unrolling in 
main loop. Compared to __strlen_generic, it reduces > latency of cases in bench-strlen by 7%~18% when the length of src > is greater than 128 bytes, with gains throughout the benchmark. This is a good improvement, OK to commit. Also, given it uses integer arithmetic for the first 16 bytes, it can never be worse off than the generic variant for small inputs. Wilco OK diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c index 1db01babeec..abf6513eeea 100644 --- a/sysdeps/aarch64/multiarch/strlen.c +++ b/sysdeps/aarch64/multiarch/strlen.c @@ -34,7 +34,9 @@ extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden; extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden; libc_ifunc (__strlen, - (USE_ASIMD_STRLEN () ? __strlen_asimd : __strlen_generic)); + (USE_ASIMD_STRLEN () || IS_KUNPENG(midr) + ? __strlen_asimd + :__strlen_generic)); # undef strlen strong_alias (__strlen, strlen); diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S index 1d1c6abb825..1de6cd3a173 100644 --- a/sysdeps/aarch64/multiarch/strlen_asimd.S +++ b/sysdeps/aarch64/multiarch/strlen_asimd.S @@ -48,6 +48,9 @@ #define dataq2 q3 #define datav2 v3 +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + #ifdef TEST_PAGE_CROSS # define MIN_PAGE_SIZE 16 #else @@ -82,40 +85,47 @@ ENTRY_ALIGN (__strlen_asimd, 6) DELOUSE (0) DELOUSE (1) and tmp1, srcin, MIN_PAGE_SIZE - 1 + mov zeroones, REP8_01 cmp tmp1, MIN_PAGE_SIZE - 16 b.gt L(page_cross) - ldr dataq, [srcin] + ldp data1, data2, [srcin] #ifdef __AARCH64EB__ - rev64 datav.16b, datav.16b + rev data1, data1 + rev data2, data2 #endif - /* Get the minimum value and keep going if it is not zero. 
*/ - uminv datab2, datav.16b - mov tmp1, datav2.d[0] - cbnz tmp1, L(main_loop_entry) - - cmeq datav.16b, datav.16b, #0 - mov data1, datav.d[0] - mov data2, datav.d[1] - cmp data1, 0 - csel data1, data1, data2, ne + sub tmp1, data1, zeroones + orr tmp2, data1, REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + beq L(main_loop_entry) + csel has_nul1, has_nul1, has_nul2, cc mov len, 8 - rev data1, data1 - clz tmp1, data1 - csel len, xzr, len, ne + rev has_nul1, has_nul1 + clz tmp1, has_nul1 + csel len, xzr, len, cc add len, len, tmp1, lsr 3 ret L(main_loop_entry): bic src, srcin, 15 + sub src, src, 16 L(main_loop): - ldr dataq, [src, 16]! + ldr dataq, [src, 32]! L(page_cross_entry): /* Get the minimum value and keep going if it is not zero. */ uminv datab2, datav.16b mov tmp1, datav2.d[0] + cbz tmp1, L(tail) + ldr dataq, [src, 16] + uminv datab2, datav.16b + mov tmp1, datav2.d[0] cbnz tmp1, L(main_loop) + add src, src, 16 L(tail): #ifdef __AARCH64EB__