Message ID | VE1PR08MB559938483A0630B16E8C64B383E49@VE1PR08MB5599.eurprd08.prod.outlook.com |
---|---|
State | Superseded |
Headers |
Return-Path: <libc-alpha-bounces+patchwork=sourceware.org@sourceware.org> X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 7D7B8393AC0E for <patchwork@sourceware.org>; Thu, 22 Jul 2021 16:05:43 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 7D7B8393AC0E DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1626969943; bh=jEe+vN1py4oED362ndRHl9Wj05x/N2GFs7JVwPLxQE4=; h=To:Subject:Date:List-Id:List-Unsubscribe:List-Archive:List-Post: List-Help:List-Subscribe:From:Reply-To:Cc:From; b=S3qNe3xQ26RPHINd2NvWHIDnuQtXHt/48XOtqhoUIXgIfdWcFHdhwmMVcrKNKKA00 QPcH6w0q6WSQfxWl9LNvM7Kg8rIaIIx8OBc9aZcsHt+wIxyhFTjW4Oo5NYehrAL7cN 1goSsPJ55shWf67S7y4DBUjDvm0xeoyCAqJotEAQ= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from EUR01-DB5-obe.outbound.protection.outlook.com (mail-eopbgr150041.outbound.protection.outlook.com [40.107.15.41]) by sourceware.org (Postfix) with ESMTPS id 9DD613848409 for <libc-alpha@sourceware.org>; Thu, 22 Jul 2021 16:05:20 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 9DD613848409 Received: from PR0P264CA0070.FRAP264.PROD.OUTLOOK.COM (2603:10a6:100:1d::34) by VI1PR0801MB1888.eurprd08.prod.outlook.com (2603:10a6:800:89::10) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.4331.29; Thu, 22 Jul 2021 16:05:18 +0000 Received: from VE1EUR03FT026.eop-EUR03.prod.protection.outlook.com (2603:10a6:100:1d:cafe::35) by PR0P264CA0070.outlook.office365.com (2603:10a6:100:1d::34) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.4352.25 via Frontend Transport; Thu, 22 Jul 2021 16:05:18 +0000 X-MS-Exchange-Authentication-Results: spf=pass (sender IP is 63.35.35.123) smtp.mailfrom=arm.com; sourceware.org; dkim=pass (signature was verified) 
header.d=armh.onmicrosoft.com;sourceware.org; dmarc=pass action=none header.from=arm.com; Received-SPF: Pass (protection.outlook.com: domain of arm.com designates 63.35.35.123 as permitted sender) receiver=protection.outlook.com; client-ip=63.35.35.123; helo=64aa7808-outbound-1.mta.getcheckrecipient.com; Received: from 64aa7808-outbound-1.mta.getcheckrecipient.com (63.35.35.123) by VE1EUR03FT026.mail.protection.outlook.com (10.152.18.148) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.4352.24 via Frontend Transport; Thu, 22 Jul 2021 16:05:18 +0000 Received: ("Tessian outbound 57330d0f8f60:v99"); Thu, 22 Jul 2021 16:05:17 +0000 X-CheckRecipientChecked: true X-CR-MTA-CID: 95a550db2db729b4 X-CR-MTA-TID: 64aa7808 Received: from bfe171db8e6d.1 by 64aa7808-outbound-1.mta.getcheckrecipient.com id C56D9E99-28B1-4FC7-BC75-E5FFFA039654.1; Thu, 22 Jul 2021 16:00:46 +0000 Received: from EUR04-HE1-obe.outbound.protection.outlook.com by 64aa7808-outbound-1.mta.getcheckrecipient.com with ESMTPS id bfe171db8e6d.1 (version=TLSv1.2 cipher=ECDHE-RSA-AES256-GCM-SHA384); Thu, 22 Jul 2021 16:00:46 +0000 ARC-Seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; b=fvT/2wJ4JxOAcaNViiY9NXlJWYFHRx0hkjGr6CAFwAzR6RehjxtnGTum0dPJKHl8L55KbHCjjHATpu3r0dKFhu+s1h5bYN3JDzlZgQA9YNwf7zlQgg2qwZd2ComotfkTlYh0t09LYjRHRIrv9AWFE66+Gfdv1y0I6zxT3DWHHjndvLH1eEa39ffa1XRddTC/g12R1JvF3hLJxQUosdQLjtQoJhgnTZm4Ql54f2lt+SKVgl5810uCrCLWwMiMuEjLo6zdaJKEPU/3iD+eJB+LCgQ/Ihkw8RiYEBaT72WxQSUZEsW1UQnC3hH5nSDRuhaSIraa0ECwnHKVFx6F84tzEw== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=jEe+vN1py4oED362ndRHl9Wj05x/N2GFs7JVwPLxQE4=; 
b=PiFgC6hApTBFKnx03Yai72yAEkfXBylXLeuuucmBDD/YqCvnmNFtegDsT3SkdqJwfe/GqeWQtFddyPFB+biN8qIhmcVdW2soz89Ty03A3/1uFAZSTq62d/aEydu6U0nPuCUWe90ryyCs0iK/7T2+5Cwjezg2LnLBoLyeQW7KOb2AieLWZfqZjaO8F0O+ACFrsjhRaSpwWXNRTPeAcXb8MoxJkPOptwF9AH5Zp9xZvbb/JajvrE8G/XnzoxDqWkpfUYA/a0yQtEydPNHia1cjr2S4yuSyLuKMW46syowd724VLOKwIeUpncRbhFCh0rKTvl7PklcrJLYAIg2N50HDAQ== ARC-Authentication-Results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=arm.com; dmarc=pass action=none header.from=arm.com; dkim=pass header.d=arm.com; arc=none Received: from VE1PR08MB5599.eurprd08.prod.outlook.com (2603:10a6:800:1a1::12) by VE1PR08MB5872.eurprd08.prod.outlook.com (2603:10a6:800:1aa::16) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.4352.24; Thu, 22 Jul 2021 16:00:44 +0000 Received: from VE1PR08MB5599.eurprd08.prod.outlook.com ([fe80::5ccd:ab57:a64f:e07e]) by VE1PR08MB5599.eurprd08.prod.outlook.com ([fe80::5ccd:ab57:a64f:e07e%7]) with mapi id 15.20.4352.025; Thu, 22 Jul 2021 16:00:44 +0000 To: "naohirot@fujitsu.com" <naohirot@fujitsu.com> Subject: [PATCH v3 2/5] AArch64: Improve A64FX memset Thread-Topic: [PATCH v3 2/5] AArch64: Improve A64FX memset Thread-Index: AQHXfxKaeH7mpjfTrkCBy3mxGimKXg== Date: Thu, 22 Jul 2021 16:00:44 +0000 Message-ID: <VE1PR08MB559938483A0630B16E8C64B383E49@VE1PR08MB5599.eurprd08.prod.outlook.com> Accept-Language: en-GB, en-US Content-Language: en-GB X-MS-Has-Attach: X-MS-TNEF-Correlator: Authentication-Results-Original: fujitsu.com; dkim=none (message not signed) header.d=none;fujitsu.com; dmarc=none action=none header.from=arm.com; x-ms-publictraffictype: Email X-MS-Office365-Filtering-Correlation-Id: e4ee4af8-ae24-4d02-39c2-08d94d2a801b x-ms-traffictypediagnostic: VE1PR08MB5872:|VI1PR0801MB1888: X-Microsoft-Antispam-PRVS: <VI1PR0801MB1888207D24AA02B84F6EF10783E49@VI1PR0801MB1888.eurprd08.prod.outlook.com> x-checkrecipientrouted: true nodisclaimer: true x-ms-oob-tlc-oobclassifiers: OLM:5236;OLM:5236; 
X-MS-Exchange-SenderADCheck: 1 X-MS-Exchange-AntiSpam-Relay: 0 X-Microsoft-Antispam-Untrusted: BCL:0; X-Microsoft-Antispam-Message-Info-Original: BhySWc1xUkP5+jIGBjMJUynYcR7VlPYnVdS09u8qYaU/qJaQo3I0aatXXtvaJAWw5DdcVEyqY4sRBwZ4/3bZD6MRgXggEI6KYE8twr3W8PSr8UL2SoRmzzPc1bRdmz9MZ1zHUj15jitEOXVSR2FiI/Ww5z8G5PjWdvGkChq+SmgzLSUA9b2Hpj9//6qeFanr8XknojtR0TjaoQs9+oCeUvdQeST/BHld3Sry4N6NvA/hhlnyWtqgz28f9D5U9n2BCAozZPbA3oJvo3MtOxs53SlFnLMEM8mjFNvpgoM1/ifDvbvGg+Hbr5XX6HRFU+qtWo/e+RHXvz8qCg8av/r7gOL7SPe4Oa30dDisx62tJPeD/X2/TMoMOtLJRGCI0ki7yS7M6PQX5iHPH3t0R5aBVjP8GnwXPW4Gn0xZR/5nmRIQWTTtfrkUj8ylHtR6o4DQrvPaMolgHWaKB1/LS0N5FmhuD6F811bfEfFqbErjrfdEYQj+Ao0Su79u1Wue2onLwsujcB/swqunsb3IpmlFw2xHpUvnCrhskSjalvm8Mv+25WAOm2CBJlcfJV9ua0mxZrgBlk6wCI8dex15Y4YNCyUfVZrtFgInuuwlGphycISK7/yynTolVylapKgGq6E8Nx30qpw6aTCCuOAgvaBStSQ8x+8xG+UGU56wB02Q5ZN5iSO+MQhX82WH9u9CCNREeNlnxez2OtaxF+5+Q5EBWnpB6WTsQHVbbgfHpyrK+d8= X-Forefront-Antispam-Report-Untrusted: CIP:255.255.255.255; CTRY:; LANG:en; SCL:1; SRV:; IPV:NLI; SFV:NSPM; H:VE1PR08MB5599.eurprd08.prod.outlook.com; PTR:; CAT:NONE; SFS:(4636009)(366004)(396003)(39860400002)(136003)(376002)(346002)(478600001)(2906002)(86362001)(6506007)(8676002)(6916009)(122000001)(38100700002)(316002)(55016002)(33656002)(7696005)(9686003)(186003)(4326008)(5660300002)(66446008)(52536014)(8936002)(66946007)(66556008)(71200400001)(76116006)(26005)(91956017)(64756008)(66476007)(38070700004)(357404004); DIR:OUT; SFP:1101; x-ms-exchange-antispam-messagedata-chunkcount: 1 x-ms-exchange-antispam-messagedata-0: =?iso-8859-1?q?jFe3EY0jE7CB0lCG6s6dQwD?= =?iso-8859-1?q?vYS7ipk2kf7nPfs/EMCoFVvxZbTOUY7UzvW9voH+dh3jKuahTNxDhdO1bAPB?= =?iso-8859-1?q?Gom0PD/UypFP1phsH+mN31PIHSNq9KysN8pergZxNahmqX2+S3eYMONS071e?= =?iso-8859-1?q?sRJ4UaqrZriecE51KDj2FulFrV90IhNz24lww3nprOmHeV7nMpbSCn7j/1To?= =?iso-8859-1?q?x3RGIuQ0JZOt2Q6xpEENecsrc5L3nAjRnbxBKl5F9aL6z6L0C53LZDB8t20P?= =?iso-8859-1?q?LBQ6ZAn+D1xrx6awvX6joUWLwStGeT1NC+Ee0WsFtBbWXLCT3pDh3gV1ejw6?= 
=?iso-8859-1?q?gj32vQPSXMa/iRw9dDlhM9hBgy4kNICK4E6JQAIgiRxUsWHMuKT/uf/fc7oI?= =?iso-8859-1?q?kF0k1KjDXdZUddlzMW7kEEryee7saGGaCcynraXCWsIfYD2NmYHC2oFmiQtI?= =?iso-8859-1?q?m+Fw6mmoaFzAAd48UotYgs7tUu53dJ/ufev+fifLZSULGlODSigXrcRLHx+V?= =?iso-8859-1?q?RUxp01SjZve/iK8hecyVoQ0c9LNfov2bI11iYNF42fnhxUsD1TCHEu4i/tsR?= =?iso-8859-1?q?iHn7Lb7FIQ9DnE7ui3LvlWao+iUPfcozcCSmzOb9+hF+m6aSp5A43fAeWGJc?= =?iso-8859-1?q?xpsZz/fmpR6LzZ2ftpu/u1hZ6dTSSYwsizkOTjo+kEcYeAjGmvNm4OwqlN2Z?= =?iso-8859-1?q?uXsv7/HnpLPX47z6VmcSsXH5a5Ps4ZcJzjoPJi4FKEBbGgZeALyoKWzKEVNT?= =?iso-8859-1?q?zwL2uS11xaXLcbl+mxDRCHkTKp7ByS1KVEb056BLcRqKsUEKYjgTumK4Jt57?= =?iso-8859-1?q?1EBEoJDlRS23l3o0tebTiZVMOIuKYgrkF6x36yk4MZ4JewQ7YIVKyQNJTzuS?= =?iso-8859-1?q?d5PWOZdSXJ7HURqOGsPcpP1ZTeWtYbYU5WM+8hgCVnYpoRLCY3MzDf6M/+/k?= =?iso-8859-1?q?Pg0XRI4W+Tr8hQIauRbm8pEwGBwzfsmOubIoap2xK/1qKjZDwNV1Dgb9J+3r?= =?iso-8859-1?q?2ZZNE8nLAVPe9jwdFpi6Qvq3cGTQCzlr4ClVzYfT3rf6CqZCYxj18nCWthVB?= =?iso-8859-1?q?2f+3CPyfLFIYZNZoBx7lMcrx+baRVi6LfFebJ3RCWK2bH6K5jek+rfF6tcb7?= =?iso-8859-1?q?c5g348s1qA20DJaarjmwZW1T6QCUQUxi2JjIPsZ3FGFCirImAf0Fk2ua5N6v?= =?iso-8859-1?q?FqCoqI65qcbIboGyH9pAjX+o5DT5+IdoD9NiBr7UMOByDENErGLbe5IDM83B?= =?iso-8859-1?q?7nzPCxmWVoaGXYoVD0ZdxiqvM4LvYiN/yrd5dPRbTzPoLIcqxBZfsCZTZ21r?= =?iso-8859-1?q?oB/bs2xt/qsLuYR9Kd/Wec6QW7/0EHKuJnnRheUE=3D?= x-ms-exchange-transport-forked: True Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 X-MS-Exchange-Transport-CrossTenantHeadersStamped: VE1PR08MB5872 Original-Authentication-Results: fujitsu.com; dkim=none (message not signed) header.d=none;fujitsu.com; dmarc=none action=none header.from=arm.com; X-EOPAttributedMessage: 0 X-MS-Exchange-Transport-CrossTenantHeadersStripped: VE1EUR03FT026.eop-EUR03.prod.protection.outlook.com X-MS-Office365-Filtering-Correlation-Id-Prvs: b755f83a-4a6c-422d-eb56-08d94d29dd1c X-Microsoft-Antispam: BCL:0; X-Microsoft-Antispam-Message-Info: 
NcAFIZyZ+0dUD/6qWWt7txq+TKSz+Lw/WtajVUjikb7wXF6sA8hP914y2/i20vhErr98uI+mjUpJEAtYeXCzCo5glBW1Lob7ppugPFlxwoSv70PqlMNJpndXINb1BeGgTPJrl9fOvX08nqOz6cW//EF5Ecclw7XNBoIvAfNHU3awvfU4u0Nw5OO0fq7afBGV/Pb8Jr7B9nHtZqJ44vFBGTMNOA5uglG+1FHbRAj2ZSCE2c/QCx/BF/pJyodtoPbChH6zTSFf/RkyhQEqix8OL3VHU7xwfczWhYUUBdM3zzDzy+jeFrqPsvIGDQVkKHPrUO66qnqkR7r5VkdgkPT7+qqxnmxO+j7GB6EXWqikVmyw0fjvsKccWDHOmhjyKW5t4uaJZx8x5f7snDviUqXXplnWMn7Tr39Z86uJU5cDZBP6P3G6spcmlyxmWf8NecCSqqCwiiqZpy4ZwL0hLMaNDuaitRuaHZI7bIQu6B2cN/UElVu1+oUkIC+4DEYFFuuqjs84gDPrk8NxI7EJpsJ9FTNvKWpslNUlVFrft61yehEo75VzBxNuqvQECn7Fj26b5MpeSYgNWg6D9Ijhbwh4N7N1A0KNu/YELf/5qZXYSP4r+F2M1NPKFAJcB8/be2OWnvJUZdWKRXYmFyGz0/GIAjmWY1ztS17SQ474DN5TCIgwMJzoBZVzrAba7Er6fIZzKuEkUEyqzr42pwUeibK3HF66a7J7E9Hlq6mupIYE6eM= X-Forefront-Antispam-Report: CIP:63.35.35.123; CTRY:IE; LANG:en; SCL:1; SRV:; IPV:CAL; SFV:NSPM; H:64aa7808-outbound-1.mta.getcheckrecipient.com; PTR:ec2-63-35-35-123.eu-west-1.compute.amazonaws.com; CAT:NONE; SFS:(4636009)(136003)(39850400004)(376002)(346002)(396003)(36840700001)(46966006)(186003)(4326008)(7696005)(478600001)(47076005)(82740400003)(356005)(81166007)(52536014)(6862004)(36860700001)(26005)(55016002)(5660300002)(9686003)(2906002)(6506007)(316002)(8936002)(86362001)(8676002)(70206006)(33656002)(70586007)(336012)(82310400003)(357404004); DIR:OUT; SFP:1101; X-OriginatorOrg: arm.com X-MS-Exchange-CrossTenant-OriginalArrivalTime: 22 Jul 2021 16:05:18.1656 (UTC) X-MS-Exchange-CrossTenant-Network-Message-Id: e4ee4af8-ae24-4d02-39c2-08d94d2a801b X-MS-Exchange-CrossTenant-Id: f34e5979-57d9-4aaa-ad4d-b122a662184d X-MS-Exchange-CrossTenant-OriginalAttributedTenantConnectingIp: TenantId=f34e5979-57d9-4aaa-ad4d-b122a662184d; Ip=[63.35.35.123]; Helo=[64aa7808-outbound-1.mta.getcheckrecipient.com] X-MS-Exchange-CrossTenant-AuthSource: VE1EUR03FT026.eop-EUR03.prod.protection.outlook.com X-MS-Exchange-CrossTenant-AuthAs: Anonymous X-MS-Exchange-CrossTenant-FromEntityHeader: HybridOnPrem 
X-MS-Exchange-Transport-CrossTenantHeadersStamped: VI1PR0801MB1888 X-Spam-Status: No, score=-12.3 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, RCVD_IN_MSPIKE_H2, SPF_HELO_PASS, SPF_PASS, TXREP, UNPARSEABLE_RELAY autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list <libc-alpha.sourceware.org> List-Unsubscribe: <https://sourceware.org/mailman/options/libc-alpha>, <mailto:libc-alpha-request@sourceware.org?subject=unsubscribe> List-Archive: <https://sourceware.org/pipermail/libc-alpha/> List-Post: <mailto:libc-alpha@sourceware.org> List-Help: <mailto:libc-alpha-request@sourceware.org?subject=help> List-Subscribe: <https://sourceware.org/mailman/listinfo/libc-alpha>, <mailto:libc-alpha-request@sourceware.org?subject=subscribe> From: Wilco Dijkstra via Libc-alpha <libc-alpha@sourceware.org> Reply-To: Wilco Dijkstra <Wilco.Dijkstra@arm.com> Cc: 'GNU C Library' <libc-alpha@sourceware.org> Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" <libc-alpha-bounces+patchwork=sourceware.org@sourceware.org> |
Series |
[v3,1/5] AArch64: Improve A64FX memset
|
|
Commit Message
Wilco Dijkstra
July 22, 2021, 4 p.m. UTC
Improve performance of large memsets. Simplify alignment code. For zero memsets use DC ZVA, which almost doubles performance. For non-zero memsets use the unroll8 loop, which is about 10% faster. ---
Comments
Hi Wilco, Thank you for the patch. I confirmed that the performance is improved than the master as shown in the graphs [1][2]. Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com> Tested-by: Naohiro Tamura <naohirot@fujitsu.com> [1] https://drive.google.com/file/d/1RxdIlJa2Wvl8eT5_TRVkvbkyS6bfZObx/view?usp=sharing [2] https://drive.google.com/file/d/1xCLsa7qweovdQpWtfnNZZwcEi7W7z3Ok/view?usp=sharing There are one comment and one question below. > Subject: [PATCH v3 2/5] AArch64: Improve A64FX memset How about "AArch64: Improve A64FX memset for more than 8MB"? > > Improve performance of large memsets. Simplify alignment code. For zero memset use DC ZVA, > which almost doubles performance. For non-zero memsets use the unroll8 loop which is about 10% faster. > > --- > > diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S > index f7fcc7b323e1553f50a2e005b8ccef344a08127d..608e0e2e2ff5259178e2fdadf1eea8816194d879 100644 > --- a/sysdeps/aarch64/multiarch/memset_a64fx.S > +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S > @@ -30,10 +30,8 @@ > #define L2_SIZE (8*1024*1024) // L2 8MB - 1MB > #define CACHE_LINE_SIZE 256 > #define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1 > -#define rest x8 > +#define rest x2 > #define vector_length x9 > -#define vl_remainder x10 // vector_length remainder > -#define cl_remainder x11 // CACHE_LINE_SIZE remainder > > #if HAVE_AARCH64_SVE_ASM > # if IS_IN (libc) > @@ -41,14 +39,6 @@ > > .arch armv8.2-a+sve > > - .macro dc_zva times > - dc zva, tmp1 > - add tmp1, tmp1, CACHE_LINE_SIZE > - .if \times-1 > - dc_zva "(\times-1)" > - .endif > - .endm > - > .macro st1b_unroll first=0, last=7 > st1b z0.b, p0, [dst, \first, mul vl] > .if \last-\first > @@ -187,54 +177,29 @@ L(L1_prefetch): // if rest >= L1_SIZE > cbnz rest, L(unroll32) > ret > > + // count >= L2_SIZE > L(L2): > - // align dst address at vector_length byte boundary > - sub tmp1, vector_length, 1 > - ands tmp2, dst, tmp1 > - // if 
vl_remainder == 0 > - b.eq 1f > - sub vl_remainder, vector_length, tmp2 > - // process remainder until the first vector_length boundary > - whilelt p2.b, xzr, vl_remainder > - st1b z0.b, p2, [dst] > - add dst, dst, vl_remainder > - sub rest, rest, vl_remainder > - // align dstin address at CACHE_LINE_SIZE byte boundary > -1: mov tmp1, CACHE_LINE_SIZE > - ands tmp2, dst, CACHE_LINE_SIZE - 1 > - // if cl_remainder == 0 > - b.eq L(L2_dc_zva) > - sub cl_remainder, tmp1, tmp2 > - // process remainder until the first CACHE_LINE_SIZE boundary > - mov tmp1, xzr // index > -2: whilelt p2.b, tmp1, cl_remainder > - st1b z0.b, p2, [dst, tmp1] > - incb tmp1 > - cmp tmp1, cl_remainder > - b.lo 2b > - add dst, dst, cl_remainder > - sub rest, rest, cl_remainder > - > -L(L2_dc_zva): > - // zero fill > - mov tmp1, dst > - dc_zva (ZF_DIST / CACHE_LINE_SIZE) - 1 > - mov zva_len, ZF_DIST > - add tmp1, zva_len, CACHE_LINE_SIZE * 2 > - // unroll > - .p2align 3 > -1: st1b_unroll 0, 3 > - add tmp2, dst, zva_len > - dc zva, tmp2 > - st1b_unroll 4, 7 > - add tmp2, tmp2, CACHE_LINE_SIZE > - dc zva, tmp2 > - add dst, dst, CACHE_LINE_SIZE * 2 > - sub rest, rest, CACHE_LINE_SIZE * 2 > - cmp rest, tmp1 // ZF_DIST + CACHE_LINE_SIZE * 2 > - b.ge 1b > - cbnz rest, L(unroll8) > - ret > + tst valw, 255 > + b.ne L(unroll8) > + // align dst to CACHE_LINE_SIZE byte boundary > + and tmp2, dst, CACHE_LINE_SIZE - 1 > + sub tmp2, tmp2, CACHE_LINE_SIZE > + st1b z0.b, p0, [dst, 0, mul vl] > + st1b z0.b, p0, [dst, 1, mul vl] > + st1b z0.b, p0, [dst, 2, mul vl] > + st1b z0.b, p0, [dst, 3, mul vl] > + sub dst, dst, tmp2 > + add count, count, tmp2 > + > + // clear cachelines using DC ZVA > + sub count, count, CACHE_LINE_SIZE > + .p2align 4 > +1: dc zva, dst DC ZVA is called if buffer size is more than 8MB and fill data is zero. In case of __memset_generic, DC ZVA is called if buffer size is more than 256B and fill data is zero. This is implemented by you[3]. 
V3 Patch 02 __memset_a64fx (green line) recorded very close performance to __memset_generic (red line) in terms of zerofill [4]. V3 Patch 05 __memset_a64fx (green line) recorded almost same performance as __memset_generic (red line) in terms of zerofill [5]. Graphs[4][5] X axis starts from 256B to 64MB, and are created by the following command. $ cat bench-memset-zerofill.out | \ > jq -r 'del(.functions.memset.results[] | select(.char2 != 0))' | \ > plot_strings.py -l -p thru -v - So DC ZVA implementations for __memset_generic and __memset_a64fx seem appropriate respectively. But comparing nonzero fill graph[6] with zero fill graph[4], why DC ZVA is only effective more than 8MB for __memset_a64fx in spite that DC ZVA is effective from smaller size for __memset_generic? Still I couldn't understand DC ZVA behavior. [3] https://sourceware.org/git/?p=glibc.git;a=commit;f=sysdeps/aarch64/memset.S;h=a8c5a2a9521e105da6e96eaf4029b8e4d595e4f5 [4] https://drive.google.com/file/d/1f0_sTiujCcEZTfxbQ1UZdVwAvbMbP2ii/view?usp=sharing [5] https://drive.google.com/file/d/1Wyp3GO-9ipcphwqOQOQ9a97EwFz90SPc/view?usp=sharing [6] https://drive.google.com/file/d/1nZ_lfj6Kz5vFCR35O0q929SceUP-wbih/view?usp=sharing Thanks. Naohiro > + add dst, dst, CACHE_LINE_SIZE > + subs count, count, CACHE_LINE_SIZE > + b.hi 1b > + add count, count, CACHE_LINE_SIZE > + b L(last) > > END (MEMSET) > libc_hidden_builtin_def (MEMSET)
Hi Wilco, I found my typo in the original code comment. Would you fix it with the following? > > -#define ZF_DIST (CACHE_LINE_SIZE * 21) // Zerofill distance > From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com> > Sent: Monday, August 2, 2021 10:29 PM > > diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S > > index f7fcc7b323e1553f50a2e005b8ccef344a08127d..608e0e2e2ff5259178e2fdadf1eea8816194d879 100644 > > --- a/sysdeps/aarch64/multiarch/memset_a64fx.S > > +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S > > @@ -30,10 +30,8 @@ > > #define L2_SIZE (8*1024*1024) // L2 8MB - 1MB Wrong: // L2 8MB - 1MB Right: // L2 8MB > > #define CACHE_LINE_SIZE 256 > > #define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1 > > -#define rest x8 > > +#define rest x2 > > #define vector_length x9 > > -#define vl_remainder x10 // vector_length remainder > > -#define cl_remainder x11 // CACHE_LINE_SIZE remainder > > Thanks. Naohiro
Hi Wilco, Sorry, I forgot to mention one thing about a readability matter. > From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com> > Sent: Monday, August 2, 2021 10:29 PM > > + // count >= L2_SIZE > > L(L2): > > - // align dst address at vector_length byte boundary > > - sub tmp1, vector_length, 1 > > - ands tmp2, dst, tmp1 > > - // if vl_remainder == 0 > > - b.eq 1f > > - sub vl_remainder, vector_length, tmp2 > > - // process remainder until the first vector_length boundary > > - whilelt p2.b, xzr, vl_remainder > > - st1b z0.b, p2, [dst] > > - add dst, dst, vl_remainder > > - sub rest, rest, vl_remainder > > - // align dstin address at CACHE_LINE_SIZE byte boundary > > -1: mov tmp1, CACHE_LINE_SIZE > > - ands tmp2, dst, CACHE_LINE_SIZE - 1 > > - // if cl_remainder == 0 > > - b.eq L(L2_dc_zva) > > - sub cl_remainder, tmp1, tmp2 > > - // process remainder until the first CACHE_LINE_SIZE boundary > > - mov tmp1, xzr // index > > -2: whilelt p2.b, tmp1, cl_remainder > > - st1b z0.b, p2, [dst, tmp1] > > - incb tmp1 > > - cmp tmp1, cl_remainder > > - b.lo 2b > > - add dst, dst, cl_remainder > > - sub rest, rest, cl_remainder > > - > > -L(L2_dc_zva): > > - // zero fill > > - mov tmp1, dst > > - dc_zva (ZF_DIST / CACHE_LINE_SIZE) - 1 > > - mov zva_len, ZF_DIST > > - add tmp1, zva_len, CACHE_LINE_SIZE * 2 > > - // unroll > > - .p2align 3 > > -1: st1b_unroll 0, 3 > > - add tmp2, dst, zva_len > > - dc zva, tmp2 > > - st1b_unroll 4, 7 > > - add tmp2, tmp2, CACHE_LINE_SIZE > > - dc zva, tmp2 > > - add dst, dst, CACHE_LINE_SIZE * 2 > > - sub rest, rest, CACHE_LINE_SIZE * 2 > > - cmp rest, tmp1 // ZF_DIST + CACHE_LINE_SIZE * 2 > > - b.ge 1b > > - cbnz rest, L(unroll8) > > - ret > > + tst valw, 255 > > + b.ne L(unroll8) > > + // align dst to CACHE_LINE_SIZE byte boundary > > + and tmp2, dst, CACHE_LINE_SIZE - 1 > > + sub tmp2, tmp2, CACHE_LINE_SIZE "tmp2" always becomes a negative value. 
I felt that it would be more natural and easier to understand if it were reversed like this: sub tmp2, CACHE_LINE_SIZE, tmp2 > > + st1b z0.b, p0, [dst, 0, mul vl] > > + st1b z0.b, p0, [dst, 1, mul vl] > > + st1b z0.b, p0, [dst, 2, mul vl] > > + st1b z0.b, p0, [dst, 3, mul vl] > > + sub dst, dst, tmp2 "dst" needs to be incremented. "dst" is in fact incremented by "sub" because "tmp2" is a negative value. So it would be more natural if "tmp2" were a positive value, like this: add dst, dst, tmp2 > > + add count, count, tmp2 "count" needs to be decremented. "count" is in fact decremented by "add" because "tmp2" is a negative value. So it would be more natural if "tmp2" were a positive value, like this: sub count, count, tmp2 Thanks. Naohiro
Hi Naohiro, > > + // align dst to CACHE_LINE_SIZE byte boundary > > + and tmp2, dst, CACHE_LINE_SIZE - 1 > > + sub tmp2, tmp2, CACHE_LINE_SIZE > "tmp2" becomes always minus value. > I felt that it would be easier to understand and natural if it is reversed like this: > > sub tmp2, CACHE_LINE_SIZE, tmp2 That's not a valid instruction though. I've just removed it in v4 since we can delay the cacheline adjustment to dst and count to later instructions. > But comparing nonzero fill graph[6] with zero fill graph[4], > why DC ZVA is only effective more than 8MB for __memset_a64fx in spite > that DC ZVA is effective from smaller size for __memset_generic? Well it seems on A64FX DC ZVA is faster only when data is not in L1. So it may be feasible to use DC ZVA for smaller sizes. Cheers, Wilco
diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S index f7fcc7b323e1553f50a2e005b8ccef344a08127d..608e0e2e2ff5259178e2fdadf1eea8816194d879 100644 --- a/sysdeps/aarch64/multiarch/memset_a64fx.S +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S @@ -30,10 +30,8 @@ #define L2_SIZE (8*1024*1024) // L2 8MB - 1MB #define CACHE_LINE_SIZE 256 #define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1 -#define rest x8 +#define rest x2 #define vector_length x9 -#define vl_remainder x10 // vector_length remainder -#define cl_remainder x11 // CACHE_LINE_SIZE remainder #if HAVE_AARCH64_SVE_ASM # if IS_IN (libc) @@ -41,14 +39,6 @@ .arch armv8.2-a+sve - .macro dc_zva times - dc zva, tmp1 - add tmp1, tmp1, CACHE_LINE_SIZE - .if \times-1 - dc_zva "(\times-1)" - .endif - .endm - .macro st1b_unroll first=0, last=7 st1b z0.b, p0, [dst, \first, mul vl] .if \last-\first @@ -187,54 +177,29 @@ L(L1_prefetch): // if rest >= L1_SIZE cbnz rest, L(unroll32) ret + // count >= L2_SIZE L(L2): - // align dst address at vector_length byte boundary - sub tmp1, vector_length, 1 - ands tmp2, dst, tmp1 - // if vl_remainder == 0 - b.eq 1f - sub vl_remainder, vector_length, tmp2 - // process remainder until the first vector_length boundary - whilelt p2.b, xzr, vl_remainder - st1b z0.b, p2, [dst] - add dst, dst, vl_remainder - sub rest, rest, vl_remainder - // align dstin address at CACHE_LINE_SIZE byte boundary -1: mov tmp1, CACHE_LINE_SIZE - ands tmp2, dst, CACHE_LINE_SIZE - 1 - // if cl_remainder == 0 - b.eq L(L2_dc_zva) - sub cl_remainder, tmp1, tmp2 - // process remainder until the first CACHE_LINE_SIZE boundary - mov tmp1, xzr // index -2: whilelt p2.b, tmp1, cl_remainder - st1b z0.b, p2, [dst, tmp1] - incb tmp1 - cmp tmp1, cl_remainder - b.lo 2b - add dst, dst, cl_remainder - sub rest, rest, cl_remainder - -L(L2_dc_zva): - // zero fill - mov tmp1, dst - dc_zva (ZF_DIST / CACHE_LINE_SIZE) - 1 - mov zva_len, ZF_DIST - add tmp1, zva_len, 
CACHE_LINE_SIZE * 2 - // unroll - .p2align 3 -1: st1b_unroll 0, 3 - add tmp2, dst, zva_len - dc zva, tmp2 - st1b_unroll 4, 7 - add tmp2, tmp2, CACHE_LINE_SIZE - dc zva, tmp2 - add dst, dst, CACHE_LINE_SIZE * 2 - sub rest, rest, CACHE_LINE_SIZE * 2 - cmp rest, tmp1 // ZF_DIST + CACHE_LINE_SIZE * 2 - b.ge 1b - cbnz rest, L(unroll8) - ret + tst valw, 255 + b.ne L(unroll8) + // align dst to CACHE_LINE_SIZE byte boundary + and tmp2, dst, CACHE_LINE_SIZE - 1 + sub tmp2, tmp2, CACHE_LINE_SIZE + st1b z0.b, p0, [dst, 0, mul vl] + st1b z0.b, p0, [dst, 1, mul vl] + st1b z0.b, p0, [dst, 2, mul vl] + st1b z0.b, p0, [dst, 3, mul vl] + sub dst, dst, tmp2 + add count, count, tmp2 + + // clear cachelines using DC ZVA + sub count, count, CACHE_LINE_SIZE + .p2align 4 +1: dc zva, dst + add dst, dst, CACHE_LINE_SIZE + subs count, count, CACHE_LINE_SIZE + b.hi 1b + add count, count, CACHE_LINE_SIZE + b L(last) END (MEMSET) libc_hidden_builtin_def (MEMSET)