Message ID | VE1PR08MB5599743405E22528D5F08D7283E49@VE1PR08MB5599.eurprd08.prod.outlook.com |
---|---|
State | Superseded |
Headers |
Return-Path: <libc-alpha-bounces+patchwork=sourceware.org@sourceware.org> X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 574C1393AC0E for <patchwork@sourceware.org>; Thu, 22 Jul 2021 16:06:30 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 574C1393AC0E DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1626969990; bh=qjKhxTYhq1TbVwTi+V76Nxn19An/EfBG4Gnx9eYM0Gg=; h=To:Subject:Date:List-Id:List-Unsubscribe:List-Archive:List-Post: List-Help:List-Subscribe:From:Reply-To:Cc:From; b=sbHR8RDf6V8YbMpyosk1Pr1nLuvtRZwPWKdOTcNlznF2mkdelFTp5m/3Grw9bRjy+ N7jXFQuDl9tsl0/Ew/Y6FtfPu9Hz/CGmu5UDlnqPscv6ysSZyix/OOG5TX2+P5rs31 BzWVTr+seppNHssQ+6liMwCMujpiIdA49k8sHETw= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from EUR05-VI1-obe.outbound.protection.outlook.com (mail-vi1eur05on2083.outbound.protection.outlook.com [40.107.21.83]) by sourceware.org (Postfix) with ESMTPS id A19E23889827 for <libc-alpha@sourceware.org>; Thu, 22 Jul 2021 16:05:32 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org A19E23889827 Received: from DB6P192CA0008.EURP192.PROD.OUTLOOK.COM (2603:10a6:4:b8::18) by AM6PR08MB3384.eurprd08.prod.outlook.com (2603:10a6:20b:4a::12) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.4331.28; Thu, 22 Jul 2021 16:05:30 +0000 Received: from DB5EUR03FT057.eop-EUR03.prod.protection.outlook.com (2603:10a6:4:b8:cafe::7b) by DB6P192CA0008.outlook.office365.com (2603:10a6:4:b8::18) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.4352.25 via Frontend Transport; Thu, 22 Jul 2021 16:05:30 +0000 X-MS-Exchange-Authentication-Results: spf=pass (sender IP is 63.35.35.123) smtp.mailfrom=arm.com; sourceware.org; dkim=pass (signature was verified) 
header.d=armh.onmicrosoft.com;sourceware.org; dmarc=pass action=none header.from=arm.com; Received-SPF: Pass (protection.outlook.com: domain of arm.com designates 63.35.35.123 as permitted sender) receiver=protection.outlook.com; client-ip=63.35.35.123; helo=64aa7808-outbound-1.mta.getcheckrecipient.com; Received: from 64aa7808-outbound-1.mta.getcheckrecipient.com (63.35.35.123) by DB5EUR03FT057.mail.protection.outlook.com (10.152.20.235) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.4352.24 via Frontend Transport; Thu, 22 Jul 2021 16:05:30 +0000 Received: ("Tessian outbound b81a99a0393d:v99"); Thu, 22 Jul 2021 16:05:30 +0000 X-CheckRecipientChecked: true X-CR-MTA-CID: 467b0e1e8536e8bd X-CR-MTA-TID: 64aa7808 Received: from a2f4ed4e03b4.1 by 64aa7808-outbound-1.mta.getcheckrecipient.com id BB302076-5B80-42BC-9CA4-85F6F99810AF.1; Thu, 22 Jul 2021 16:04:35 +0000 Received: from EUR03-AM5-obe.outbound.protection.outlook.com by 64aa7808-outbound-1.mta.getcheckrecipient.com with ESMTPS id a2f4ed4e03b4.1 (version=TLSv1.2 cipher=ECDHE-RSA-AES256-GCM-SHA384); Thu, 22 Jul 2021 16:04:35 +0000 ARC-Seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; b=g/fXntQ+1eeDzOnuaVK7ovDldgpsz/GKmYXOejMLd50nmVQGXMKR3/CZLK0/Vj9wTMquTfCoaJz1sdaw5is28iEJiTKXYwHInOflp8KcrspKm+14mexQSLVfKCIq5VpjreTTFNdkSwgH48vBAvLUTYlg08v0iVoWjXW/zBtJ5g2oXjjnSBQvIBBORqFkQwHAIJsEZQlMhZk6EtH0IqHNtZXjBQZS4NnMX9+u4FlCGgMOQ/HIJOlVLroeY0UhYaXonPtcW/IjXciYsaxoCGmNDColY/RvjSoFpeoQyh6/KnSuAj0RtY5a/VahbiElgaL41mAzksCNESZTvSRx59MKZw== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=qjKhxTYhq1TbVwTi+V76Nxn19An/EfBG4Gnx9eYM0Gg=; 
b=kfq6D+eG5mTdNCFGlNgL9f/+RjybwWVIcZFBQgUbG1ZRgEul4dGW2Oq7YewjugFltqUEEflC3+n7HOaIWNnvaeAGo/tL5kPt+A3XkUt9IJvu1PM3mXNpyGpG88ScMClAm2aQteZJ9cybWVdyqUQiYDAPsyicuPTnpFWsYUi/g4Wv7dRHSYP1oA18ipPd+MA18aeGTvyNgUxD+Cp1dEhdM4LrNdnqdw9t5i88vUZf0h5sQE+108atAPsYslNUnbkQt1X1UZcQPFKe8RC4M6MRWcl6LhTsBc+G+ZIaprfGZAUD1wYUbIW7jWxNGX8TDZiBI2ZG1hMUEsh4zPVyzlQgnw== ARC-Authentication-Results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=arm.com; dmarc=pass action=none header.from=arm.com; dkim=pass header.d=arm.com; arc=none Received: from VE1PR08MB5599.eurprd08.prod.outlook.com (2603:10a6:800:1a1::12) by VI1PR08MB3664.eurprd08.prod.outlook.com (2603:10a6:803:81::20) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.4352.25; Thu, 22 Jul 2021 16:04:34 +0000 Received: from VE1PR08MB5599.eurprd08.prod.outlook.com ([fe80::5ccd:ab57:a64f:e07e]) by VE1PR08MB5599.eurprd08.prod.outlook.com ([fe80::5ccd:ab57:a64f:e07e%7]) with mapi id 15.20.4352.025; Thu, 22 Jul 2021 16:04:34 +0000 To: "naohirot@fujitsu.com" <naohirot@fujitsu.com> Subject: [PATCH v3 5/5] AArch64: Improve A64FX memset Thread-Topic: [PATCH v3 5/5] AArch64: Improve A64FX memset Thread-Index: AQHXfxMmDOwcjJYmP0eOZ48wdBomwQ== Date: Thu, 22 Jul 2021 16:04:34 +0000 Message-ID: <VE1PR08MB5599743405E22528D5F08D7283E49@VE1PR08MB5599.eurprd08.prod.outlook.com> Accept-Language: en-GB, en-US Content-Language: en-GB X-MS-Has-Attach: X-MS-TNEF-Correlator: Authentication-Results-Original: fujitsu.com; dkim=none (message not signed) header.d=none;fujitsu.com; dmarc=none action=none header.from=arm.com; x-ms-publictraffictype: Email X-MS-Office365-Filtering-Correlation-Id: b7b3246b-c183-47e0-b6df-08d94d2a8791 x-ms-traffictypediagnostic: VI1PR08MB3664:|AM6PR08MB3384: X-Microsoft-Antispam-PRVS: <AM6PR08MB33841BCD5886D8E547B621C183E49@AM6PR08MB3384.eurprd08.prod.outlook.com> x-checkrecipientrouted: true nodisclaimer: true x-ms-oob-tlc-oobclassifiers: OLM:4714;OLM:4714; X-MS-Exchange-SenderADCheck: 
1 X-MS-Exchange-AntiSpam-Relay: 0 X-Microsoft-Antispam-Untrusted: BCL:0; X-Microsoft-Antispam-Message-Info-Original: 9/7NzTGmxWB2XqPvtpBoGhY2YScXDpiaRzcM6QAPnGl4ADQukp+l+Ak7FK8PQPCSaIRSFW2NG2fC1AdPJmPi/Y9xytbsviTJjY5ts3vwyhbOOXzRxOvNOFGtQhCpHtolkbcwkE4Q25RNDEDKG2v9ceX+fdpDnXmK3ijlgBkGmjo7S2dCkqh5+dCjMk6mQl3kxy1o3mLyDEyrtELV+PO+gB8W4IbJeU5FlyrmMWWDCYtzK3g0dq/JdqCq4BmtidRThkdeb9PeFrh7LJUuUZrNTWDEqZCH8kJuS/a6kjnSLFuQyxq3iowAr8U8h4LH9heQYayw1cnv4/ZqCGjLSzUZw5dYXnPKWj93JvBwbmJdVAlu2aytvo5i9R0o4ja9TSP1AOtfy5qwDaaB68eoGZlGiAfdob+nX0mEwt+qO40Jr2kZxYOlZpWgYYHmFAWxEGxfoT/iK7wYi3/REvvSaSXA8AHZ/X9mt7Sl8nIYHqw+w7w/jm9Asv5AFaxOu8SnpxYN5I9rvxiyZGrlFz1jO6WHwTBLM2kbhcms8i27Zja9lIKkU5Ugg89k+rVc70JkMTwM3cIfHQzCe9WiXWcPOcdw/qKKknF5PfkWYcybZ8QebRHDmpTvT+CguqQZ7KQjCy2AYpr/o1k8eq/KiosPDevwxFw4dYIpWTLEh8/cDSO6zWAb4zn2YqRNQhGYANj4AOSc7m7Y7kxf6nmODNM0mxqJ7kEGXKu9oIrixI4O/0/AmBfKbHGi1w12lRKMkjVDLfPjMK3qZw8pHPNgs4P5P0KCnQ== X-Forefront-Antispam-Report-Untrusted: CIP:255.255.255.255; CTRY:; LANG:en; SCL:1; SRV:; IPV:NLI; SFV:NSPM; H:VE1PR08MB5599.eurprd08.prod.outlook.com; PTR:; CAT:NONE; SFS:(4636009)(39850400004)(376002)(136003)(346002)(366004)(396003)(55016002)(186003)(26005)(71200400001)(9686003)(478600001)(66476007)(66946007)(91956017)(64756008)(66556008)(5660300002)(66446008)(8936002)(76116006)(86362001)(33656002)(2906002)(52536014)(4326008)(6916009)(8676002)(6506007)(7696005)(316002)(38100700002)(122000001)(38070700004)(473944003)(357404004); DIR:OUT; SFP:1101; x-ms-exchange-antispam-messagedata-chunkcount: 1 x-ms-exchange-antispam-messagedata-0: =?iso-8859-1?q?CkSOxnNxA1IHGExEzpcV7Rl?= =?iso-8859-1?q?vyzv+InUx3TywfEYKqgGuehlMDeNG3tMlN3DJKRsoDnmdzWikcmr8Qbp8FQe?= =?iso-8859-1?q?qYsp2JA0g25sgbxhHloMwiTnkecGr+lkRLdQMHOrvK4efyFxT3KPGHjALAmX?= =?iso-8859-1?q?EgWcviTjy5jYNM0Bwsbr5U+NTLJ35BjoYfJZurxNfOGSauQEXg8Y1/u0dCbk?= =?iso-8859-1?q?g5uc/p0AChWlBMEdXEKeUDRcXkvZ3k2jlujeqDK9nBLqeV0ys4WqziwuzEwF?= =?iso-8859-1?q?IY/VDK7tfjHFexAm1r+Kr3AiinnJj84sQc/hNxt0T8k+4PJ5h6FdvwbT0Qg7?= 
=?iso-8859-1?q?ICyERAKwy9Wb5ikG2QO40pNZ7JYgienE4WUFN6sjwVCzBF45xvz+qAyWFgLq?= =?iso-8859-1?q?UVsgPpfxoSnwZTWB7Fe+gDKtJLbW8xYACBoZWW6PyfFhYImKMgLJ99G8Tfyb?= =?iso-8859-1?q?LriNFOJiXqKR0X65b1KPacZonu+nUanMjAVBc2sUE5KCfjjcjNPHSUWjDvr+?= =?iso-8859-1?q?AsomOGaq3Q4QZwBaGGsPbVagcSHAzfkJtpwQPhviqBae+9B/VWiznU4XPGbR?= =?iso-8859-1?q?2ib2uwNTBmeY25kWpVX39UdLkBNeKOV2XEpZw81ga9a2wW6GzruOrm8lg+Zm?= =?iso-8859-1?q?p/WBYCtSUHKCCy4BHOts9b3j5W0z02RziIFXtfAYlwCvs91dXQIyLPJOEdA/?= =?iso-8859-1?q?lsuc7eXgYAYknSkyRaN19H5wfDfxikQNLsp6aMe1/6sFhTnyvGxFX560ETmV?= =?iso-8859-1?q?X1GGffhhLFvOyZwmqMQyWcrhqk4TOc5goKV9JeB15Fb+Q5lNLcp4dcd8srOk?= =?iso-8859-1?q?ZjzB5C0ni1WkMvdBLqOrPPq/kQfErYl4s4vgQ5rOK0kKHOf3PLI7I5NJtpO9?= =?iso-8859-1?q?RpCXtCO9XNeYv1twJeRvVlkAUUtQv01Ajb/Bg2D0mLIoy5mbp0Pj+uAZaKW9?= =?iso-8859-1?q?/6vr/BtsSvM7ZpsEyGVyxqV9al6QbwW4R/EZSZQu3IwXXV8zy6Wh7cJegLME?= =?iso-8859-1?q?523+KTVG3otuuRZvWNu7ctQWS8ShuDEgLNaj/W6eTcFsGObS2V6ZGhyoAB5i?= =?iso-8859-1?q?pxsR75cmb4ATvnjg67smu4r9DMIdXAWdnce7snIxb3Xr5EjBfaTx6ev/00hP?= =?iso-8859-1?q?ylYJru8BLebHDVgZMGmHdjSz8upVhYMeqlbODNs+yFUof5kR79PKOEa2UdI+?= =?iso-8859-1?q?wkJ5jU+wd3oM1OV/JS9Qkkm7uvUPZ59lXKtGcQyYPu26twhuAT55dwbFYeA0?= =?iso-8859-1?q?TSwlB7iU0c/xPrGcZukecXD8aOS7J/lp+fVk/AGwjaI8L/Uocjg9IPU3Tsgr?= =?iso-8859-1?q?vnMXX03aw9HEHC01c80aPuibZIQWutHBasAc/t5o=3D?= x-ms-exchange-transport-forked: True Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 X-MS-Exchange-Transport-CrossTenantHeadersStamped: VI1PR08MB3664 Original-Authentication-Results: fujitsu.com; dkim=none (message not signed) header.d=none;fujitsu.com; dmarc=none action=none header.from=arm.com; X-EOPAttributedMessage: 0 X-MS-Exchange-Transport-CrossTenantHeadersStripped: DB5EUR03FT057.eop-EUR03.prod.protection.outlook.com X-MS-Office365-Filtering-Correlation-Id-Prvs: b4e22b34-f941-4b7d-ff30-08d94d2a65e7 X-Microsoft-Antispam: BCL:0; X-Microsoft-Antispam-Message-Info: 
9mAQe1SGMcya+LpYc+6OBjpaDhIp+wuuVdRk47IlaLoZOXFGUQ2IWlx+wBSr2Zyucc83rcjP8m4nG+6zf0i2eKaK3VE2izbjtc4eb2M1DG3e7t8f2dqjuHctNGa0gf5WBjKXLP6A2vj/QEtZhzjRCcwvGa7pnoo/SjxQRFfFpcw/JZxXgCmixAMO72cpW3tB1VaG3dOPJuAF4YoNxMLMmsU8iqwBe00lbSrjF4bet2BXKpMryiVdlqjJcffBMVPvib0ZWr+LeeX4LoIxcmA6a5pWVG0ai0jdOydYZ9jma7wPrDdpy+eMck4hTy3vcsn0MwEQKfnDvbFTPuLFYqYXfGE9Q8kMjblgsv7DVAPgKTtnlBwBsiCS5sDA/Y9jum0zsO2shngSyYqsFhpfWMMnI/TzaJ4xA5qRgOQ8XFx724RN5QpTQH9PhJ9Yax0rXMM3zlQGKSDz4iKlgkKIq64XGAJdubSXv5czX1354kAaDns5kJ1VFjNplqxpIU74MaeRDsenZHULRPOsPQcRPomSASnxu9KO1ISv2DDaq3oQIIQSm6A8/abYRC/OXWyW3QCc9rWmgMVXOpWJnwf7Ehnw4LuKM5ZYua8jPYU7bYMdnyPo/tZNEMwSi/r7rHmc9NnlsdUSDHai+eLZbBt81j5n6Y/DcuH/aX030rvjIkNK4sGF8aWj3VtVIb9l6VOnvs31vdCJyO2nkKNr0UZePYZt4zt0HMN/98Y8dQk7vOPma9IQmB/mN60gRUD5sPTv/YqP X-Forefront-Antispam-Report: CIP:63.35.35.123; CTRY:IE; LANG:en; SCL:1; SRV:; IPV:CAL; SFV:NSPM; H:64aa7808-outbound-1.mta.getcheckrecipient.com; PTR:ec2-63-35-35-123.eu-west-1.compute.amazonaws.com; CAT:NONE; SFS:(4636009)(376002)(346002)(136003)(396003)(39850400004)(36840700001)(46966006)(2906002)(7696005)(82740400003)(55016002)(33656002)(6862004)(70586007)(26005)(186003)(9686003)(8936002)(4326008)(86362001)(356005)(8676002)(36860700001)(6506007)(52536014)(81166007)(70206006)(478600001)(5660300002)(47076005)(316002)(336012)(82310400003)(473944003)(357404004); DIR:OUT; SFP:1101; X-OriginatorOrg: arm.com X-MS-Exchange-CrossTenant-OriginalArrivalTime: 22 Jul 2021 16:05:30.8004 (UTC) X-MS-Exchange-CrossTenant-Network-Message-Id: b7b3246b-c183-47e0-b6df-08d94d2a8791 X-MS-Exchange-CrossTenant-Id: f34e5979-57d9-4aaa-ad4d-b122a662184d X-MS-Exchange-CrossTenant-OriginalAttributedTenantConnectingIp: TenantId=f34e5979-57d9-4aaa-ad4d-b122a662184d; Ip=[63.35.35.123]; Helo=[64aa7808-outbound-1.mta.getcheckrecipient.com] X-MS-Exchange-CrossTenant-AuthSource: DB5EUR03FT057.eop-EUR03.prod.protection.outlook.com X-MS-Exchange-CrossTenant-AuthAs: Anonymous X-MS-Exchange-CrossTenant-FromEntityHeader: HybridOnPrem 
X-MS-Exchange-Transport-CrossTenantHeadersStamped: AM6PR08MB3384 X-Spam-Status: No, score=-12.3 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, RCVD_IN_MSPIKE_H2, SPF_HELO_PASS, SPF_PASS, TXREP, UNPARSEABLE_RELAY autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list <libc-alpha.sourceware.org> List-Unsubscribe: <https://sourceware.org/mailman/options/libc-alpha>, <mailto:libc-alpha-request@sourceware.org?subject=unsubscribe> List-Archive: <https://sourceware.org/pipermail/libc-alpha/> List-Post: <mailto:libc-alpha@sourceware.org> List-Help: <mailto:libc-alpha-request@sourceware.org?subject=help> List-Subscribe: <https://sourceware.org/mailman/listinfo/libc-alpha>, <mailto:libc-alpha-request@sourceware.org?subject=subscribe> From: Wilco Dijkstra via Libc-alpha <libc-alpha@sourceware.org> Reply-To: Wilco Dijkstra <Wilco.Dijkstra@arm.com> Cc: 'GNU C Library' <libc-alpha@sourceware.org> Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" <libc-alpha-bounces+patchwork=sourceware.org@sourceware.org> |
Series |
[v3,1/5] AArch64: Improve A64FX memset
|
|
Commit Message
Wilco Dijkstra
July 22, 2021, 4:04 p.m. UTC
Simplify the code for memsets smaller than L1. Improve the unroll8 and L1_prefetch loops. ---
Comments
Hi Wilco, Thank you for the patch. I confirmed V3 Part 5 performance is better than the master except 16KB dip [1]. See the comparison graphs between the master and V3 Part 5 [1][2][3]. And the 16KB dip can be fixed, see the below. Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com> Tested-by: Naohiro Tamura <naohirot@fujitsu.com> [1] https://drive.google.com/file/d/10ujn5LNOqgI2VpynUc1Adt9U777_Ixxz/view?usp=sharing [2] https://drive.google.com/file/d/14vCq_ng0tFDjo1BRqaMm9m9o3Kntjr0v/view?usp=sharing [3] https://drive.google.com/file/d/1GBFk8czzJV5hB9sT93qB7Rw7pHQzEfRt/view?usp=sharing > -----Original Message----- > From: Wilco Dijkstra <Wilco.Dijkstra@arm.com> > Sent: Friday, July 23, 2021 1:05 AM > To: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com> > Cc: 'GNU C Library' <libc-alpha@sourceware.org> > Subject: [PATCH v3 5/5] AArch64: Improve A64FX memset How about like this? "AArch64: Improve A64FX memset by removing rest variable" > > Simplify the code for memsets smaller than L1. Improve the unroll8 and L1_prefetch loops. 
> > --- > diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S > index 8665c272431b46dadea53c63ab74829c3aa99312..36628e101db33a9a8ff5234b98dd5a3a5c9ed73c 100644 > --- a/sysdeps/aarch64/multiarch/memset_a64fx.S > +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S > @@ -30,7 +30,6 @@ > #define L2_SIZE (8*1024*1024) // L2 8MB - 1MB > #define CACHE_LINE_SIZE 256 > #define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1 > -#define rest x2 > #define vector_length x9 > > #if HAVE_AARCH64_SVE_ASM > @@ -89,29 +88,19 @@ ENTRY (MEMSET) > > .p2align 4 > L(vl_agnostic): // VL Agnostic > - mov rest, count > mov dst, dstin > - add dstend, dstin, count > - // if rest >= L2_SIZE && vector_length == 64 then L(L2) > - mov tmp1, 64 > - cmp rest, L2_SIZE > - ccmp vector_length, tmp1, 0, cs > - b.eq L(L2) > - // if rest >= L1_SIZE && vector_length == 64 then L(L1_prefetch) > - cmp rest, L1_SIZE > - ccmp vector_length, tmp1, 0, cs > - b.eq L(L1_prefetch) > - > + cmp count, L1_SIZE > + b.hi L(L1_prefetch) > > + // count >= 8 * vector_length > L(unroll8): > - lsl tmp1, vector_length, 3 > - .p2align 3 > -1: cmp rest, tmp1 > - b.cc L(last) > - st1b_unroll > + sub count, count, tmp1 > + .p2align 4 > +1: subs count, count, tmp1 > + st1b_unroll 0, 7 > add dst, dst, tmp1 > - sub rest, rest, tmp1 > - b 1b > + b.hi 1b > + add count, count, tmp1 > Reverting unroll8 logic to V3 Part 4 fixed 16KB dip [4]. See the comparison graphs between the master and V3 Part 5 fixed [4][5][6]. L(unroll8): lsl tmp1, vector_length, 3 .p2align 3 1: cmp count, tmp1 b.cc L(last) st1b_unroll add dst, dst, tmp1 sub count, count, tmp1 b 1b [4] https://drive.google.com/file/d/1NfaEF24ud8JOpCktlzoeQ5VvyJc593lD/view?usp=sharing [5] https://drive.google.com/file/d/1DfwPenANTwgLm2kqu_w9QugmYNqysOMF/view?usp=sharing [6] https://drive.google.com/file/d/1OL6_gbdevwJmfEeRbEANJ4pVdqaSRZvV/view?usp=sharing Thanks. 
Naohiro > L(last): > cmp count, vector_length, lsl 1 > @@ -129,18 +118,22 @@ L(last): > st1b z0.b, p0, [dstend, -1, mul vl] > ret > > -L(L1_prefetch): // if rest >= L1_SIZE > + // count >= L1_SIZE > .p2align 3 > +L(L1_prefetch): > + cmp count, L2_SIZE > + b.hs L(L2) > + cmp vector_length, 64 > + b.ne L(unroll8) > 1: st1b_unroll 0, 3 > prfm pstl1keep, [dst, PF_DIST_L1] > st1b_unroll 4, 7 > prfm pstl1keep, [dst, PF_DIST_L1 + CACHE_LINE_SIZE] > add dst, dst, CACHE_LINE_SIZE * 2 > - sub rest, rest, CACHE_LINE_SIZE * 2 > - cmp rest, L1_SIZE > - b.ge 1b > - cbnz rest, L(unroll8) > - ret > + sub count, count, CACHE_LINE_SIZE * 2 > + cmp count, PF_DIST_L1 > + b.hs 1b > + b L(unroll8) > > // count >= L2_SIZE > L(L2):
Hi Naohiro, > Reverting unroll8 logic to V3 Part 4 fixed 16KB dip [4]. > See the comparison graphs between the master and V3 Part 5 fixed [4][5][6]. I don't see an improvement from the old unroll8 loop - there is about 2% benefit on 16KB, but all other sizes become slower. At size 1K it is 50% slower... I tried some other variations and moving the SUBS to the end of the loop appears slightly better overall, so I've done that for the v4 patch. Cheers, Wilco
Hi Wilco, Thank you for the comment and V4 Patch. > From: Wilco Dijkstra <Wilco.Dijkstra@arm.com> > Sent: Monday, 9 August 2021 23:52 > > > Reverting unroll8 logic to V3 Part 4 fixed 16KB dip [4]. > > See the comparison graphs between the master and V3 Part 5 fixed [4][5][6]. > > I don't see an improvement from the old unroll8 loop - there is about 2% > benefit on 16KB, but all other sizes become slower. At size 1K it is 50% > slower... I tried some other variations and moving the SUBS to the end > of the loop appears slightly better overall, so I've done that for the v4 patch. If our results are different from each other, that must be due to test environment difference. In my test environment, Patch V4 has a little bit larger 16KB dip than Patch V3 [1]. So let me ask you the same question I asked in the mail [2] before. How did you measure the performance? Did you use some kind of simulator? Or do you have real A64FX environment? [1] https://drive.google.com/file/d/1NajofMts3aPHwlvuXYXt48kyGPXPcJfk/view?usp=sharing [2] https://sourceware.org/pipermail/libc-alpha/2021-July/128313.html Thanks. Naohiro
Hi Naohiro, > In my test environment, Patch V4 has a little bit lager 16KB dip than > Patch V3 [1]. That's odd since nothing significant changed in the loop. The performance variations don't make much sense to me, it seems A64FX is very sensitive to something, but it's not clear what exactly it is. Note that I didn't see anything similar for 16KB in bench-memset-large or bench-memset-walk. However if you'd like to get to the bottom of this, one way would be to write a small benchmark that reproduces the dip (which is likely non-trivial) and use perf stat to get performance counter results for the different loops. > So let me ask you the same question I asked in the mail [2] before. > > How did you measure the performance? Did you use some kind of simulator? > > Or do you have real A64FX environment? Yes, I just ran it on an A64FX machine. Cheers, Wilco
Hi Wilco, > > In my test environment, Patch V4 has a little bit lager 16KB dip than > > Patch V3 [1]. > > That's odd since nothing significant changed in the loop. The performance > variations don't make much sense to me, it seems A64FX is very sensitive > to something, but it's not clear what exactly it is. > I feel it's odd too, but it's reality. The 16KB dip is reproducible in all of three A64FX environments I have, FX1000 master node, FX1000 compute node, and FX700. > Note that I didn't see anything similar for 16KB in bench-memset-large or > bench-memset-walk. However if you'd like to get to the bottom of this, one > way would be to write a small benchmark that reproduces the dip (which is > likely non-trivial) and use perf stat to get performance counter results for the > different loops. I didn't see anything similar for 16KB in bench-memset-large or bench-memset-walk too. As I presented V3 patch fixed, reverting unroll8 code solved the 16KB dip. The following two graphs show for V4 patch fixed that unroll8 code is reverted. The first graph [1] shows comparison the master with V4 fixed. The second graph [2] shows comparison V4 with V4 fixed. [1] https://drive.google.com/file/d/19og4ZhU9itzFAVXX8TIzlpgiiukiXQbp/view?usp=sharing [2] https://drive.google.com/file/d/1wQgPU6GyRQ_Z8ibsGja-NfdKhN5bz7I9/view?usp=sharing In my environment, I don't have any performance degradation by reverting unroll8, but 16KB performance improvement as shown in the graphs. In your environment, do you have any performance degradation by reverting unroll8? If there is no disadvantage by reverting unroll8, why don't we revert it? > > So let me ask you the same question I asked in the mail [2] before. > > > > How did you measure the performance? Did you use some kind of simulator? > > > > Or do you have real A64FX environment? > > Yes, I just ran it on an A64FX machine. Is it HPE Apollo 80 System? Or does ARM Company have an account to Fujitsu FX1000 or FX700? Thanks. Naohiro
Hi Naohiro, > In my environment, I don't have any performance degradation by reverting unroll8, > but 16KB performance improvement as shown in the graphs. I still see a major regression at 1KB in the graph (it is larger relatively than the gain at 16KB), plus many smaller regressions between 2KB-8KB. > In your environment, do you have any performance degradation by reverting unroll8? > If there is no disadvantage by reverting unroll8, why don't we revert it? For me bench-memset shows a 50% regression with the unroll8 loop reverted plus many smaller regressions. So I don't think reverting is a good idea. I tried "perf stat" and oddly enough this loop causes a lot of branch mispredictions. However if you add a branch at the top of the loop that is never taken (eg. blt and ensuring the sub above it sets the flags), it becomes faster than the best results so far. If you can reproduce that, it is probably the best workaround. > Is it HPE Apollo 80 System? > Or does ARM Company have an account to Fujitsu FX1000 or FX700? It has 48 cores, that's all I know... Cheers, Wilco
Hi Wilco, > > In my environment, I don't have any performance degradation by reverting unroll8, > > but 16KB performance improvement as shown in the graphs. > > I still see a major regression at 1KB in the graph (it is larger relatively than the gain at 16KB), > plus many smaller regressions between 2KB-8KB. Are you talking about the regression between V4 and V4 fixed? If so, that is also observed in my environment as shown in the graph [2]. But V4 fixed is not degraded than the master as shown in the graph [1]. I think we are getting almost same result each other, but not exactly same, right? > > The first graph [1] shows comparison the master with V4 fixed. > > The second graph [2] shows comparison V4 with V4 fixed. > > > > [1] https://drive.google.com/file/d/19og4ZhU9itzFAVXX8TIzlpgiiukiXQbp/view?usp=sharing > > [2] https://drive.google.com/file/d/1wQgPU6GyRQ_Z8ibsGja-NfdKhN5bz7I9/view?usp=sharing > > In your environment, do you have any performance degradation by reverting unroll8? > > If there is no disadvantage by reverting unroll8, why don't we revert it? > > For me bench-memset shows a 50% regression with the unroll8 loop reverted plus > many smaller regressions. So I don't think reverting is a good idea. If the 50% regression in your environment is at 1KB, the regression at 1KB happens in my environment too as shown in the graph [4], but the rate seems less than 50%. Both your result and my result are true and real. I don't think it's rational to make decision by looking at only one environment result. As I explained at the bottom of this mail, V4 code is tuned to Applo 80 and FX700. So we need to take FX1000 into account too. > I tried "perf stat" and oddly enough this loop causes a lot of branch mispredictions. > However if you add a branch at the top of the loop that is never taken (eg. blt and > ensuring the sub above it sets the flags), it becomes faster than the best results so far. > If you can reproduce that, it is probably the best workaround. 
Does "it becomes faster than the best results so far" mean faster than the master? I think we should put the baseline or bottom line to the master performance. If the workaround is not faster than or equal to the master at 16KB which has the peak performance, reverting unroll8 is preferable. I'm not sure if I understood what the workaround code looks like, is it like this? L(unroll8): sub count, count, tmp1 .p2align 4 1: subs tmp2, xzr, xzr b.lt 1b st1b_unroll 0, 7 add dst, dst, tmp1 subs count, count, tmp1 b.hi 1b add count, count, tmp1 > > Is it HPE Apollo 80 System? > > Or does ARM Company have an account to Fujitsu FX1000 or FX700? > > It has 48 cores, that's all I know... I think your environment must be Apollo 80 or FX700 which has 48 cores and 4 NUMA nodes. FX1000 master node has 52 cores and FX1000 compute node has 50 cores. OS sees FX1000 as if it has 8 NUMA nodes. Thanks. Naohiro
Fixed a typo inline > -----Original Message----- > From: Libc-alpha <libc-alpha-bounces+naohirot=fujitsu.com@sourceware.org> On Behalf Of naohirot--- via Libc-alpha > Sent: Tuesday, August 24, 2021 4:56 PM > To: Wilco Dijkstra <Wilco.Dijkstra@arm.com> > Cc: 'GNU C Library' <libc-alpha@sourceware.org> > Subject: RE: [PATCH v3 5/5] AArch64: Improve A64FX memset > > Hi Wilco, > > > > In my environment, I don't have any performance degradation by reverting unroll8, > > > but 16KB performance improvement as shown in the graphs. > > > > I still see a major regression at 1KB in the graph (it is larger relatively than the gain at 16KB), > > plus many smaller regressions between 2KB-8KB. > > Are you talking about the regression between V4 and V4 fixed? > If so, that is also observed in my environment as shown in the graph [2]. > But V4 fixed is not degraded than the master as shown in the graph [1]. > > I think we are getting almost same result each other, but not exactly same, right? > > > > The first graph [1] shows comparison the master with V4 fixed. > > > The second graph [2] shows comparison V4 with V4 fixed. > > > > > > [1] https://drive.google.com/file/d/19og4ZhU9itzFAVXX8TIzlpgiiukiXQbp/view?usp=sharing > > > [2] https://drive.google.com/file/d/1wQgPU6GyRQ_Z8ibsGja-NfdKhN5bz7I9/view?usp=sharing > > > > In your environment, do you have any performance degradation by reverting unroll8? > > > If there is no disadvantage by reverting unroll8, why don't we revert it? > > > > For me bench-memset shows a 50% regression with the unroll8 loop reverted plus > > many smaller regressions. So I don't think reverting is a good idea. > > If the 50% regression in your environment is at 1KB, the regression at 1KB happens > in my environment too as shown in the graph [4], but the rate seems less than 50%. > "the graph [4]" should be "the graph [2]". Thanks. Naohiro > Both your result and my result are true and real. 
> I don't think it's rational to make decision by looking at only one environment result. > > As I explained at the bottom of this mail, V4 code is tuned to Applo 80 and FX700. > So we need to take FX1000 into account too. > > > I tried "perf stat" and oddly enough this loop causes a lot of branch mispredictions. > > However if you add a branch at the top of the loop that is never taken (eg. blt and > > ensuring the sub above it sets the flags), it becomes faster than the best results so far. > > If you can reproduce that, it is probably the best workaround. > > Does "it becomes faster than the best results so far" mean faster than the master? > I think we should put the baseline or bottom line to the master performance. > If the workaround is not faster than or equal to the master at 16KB which has the peak > performance, reverting unroll8 is preferable. > > I'm not sure if I understood what the workaround code looks like, is it like this? > > L(unroll8): > sub count, count, tmp1 > .p2align 4 > 1: subs tmp2, xzr, xzr > b.lt 1b > st1b_unroll 0, 7 > add dst, dst, tmp1 > subs count, count, tmp1 > b.hi 1b > add count, count, tmp1 > > > > Is it HPE Apollo 80 System? > > > Or does ARM Company have an account to Fujitsu FX1000 or FX700? > > > > It has 48 cores, that's all I know... > > I think your environment must be Applo 80 or FX700 which has 48 cores and 4 NUMA nodes. > FX1000 master node has 52 cores and FX1000 compute node has 50 cores. > OS sees FX1000 as if it has 8 NUMA nodes. > > Thanks. > Naohiro
Hi Naohiro, > Are you talking about the regression between V4 and V4 fixed? > If so, that is also observed in my environment as shown in the graph [2]. I was talking about your graph [2] - my results are below. > But V4 fixed is not degraded than the master as shown in the graph [1]. That may be true but it has lost a lot of the performance gains of V4 just to improve the 16KB datapoint in one benchmark. I don't believe that is a good tradeoff. > I think we are getting almost same result each other, but not exactly same, right? > > If the 50% regression in your environment is at 1KB, the regression at 1KB happens > in my environment too as shown in the graph [4], but the rate seems less than 50%. Yes the differences are at similar sizes but with different magnitude. > Both your result and my result are true and real. > I don't think it's rational to make decision by looking at only one environment result. It's odd the behaviour with the same CPU isn't identical. If there is a way to make them behave more similarly, I would love to hear it! In any case it would be good to know how the blt workaround works on your system. > Does "it becomes faster than the best results so far" mean faster than the master? With best result I mean fastest of V4 and V4 with unroll8. These are the results I get for bench-memset compared to V4 (higher = faster): v4+blt v4+unroll8 0-512 0.01% 0.00% 1K-4K -0.15% -3.07% 4K-8K 0.11% -0.04% 16K 3.56% 1.98% 32K 0.74% -0.71% 64K 1.91% 0.53% 128K 0.23% 0.10% So the blt workaround improves performance of larger sizes far more than unroll8, and most importantly, it doesn't regress smaller sizes like unroll8. > I think we should put the baseline or bottom line to the master performance. > If the workaround is not faster than or equal to the master at 16KB which has the peak > performance, reverting unroll8 is preferable. A new implementation does not need to beat a previous version on every single size. 
It would be impossibly hard to achieve that - an endless game of whack-a-mole... So I always look for better performance overall and for commonly used size ranges (see above table, V4+blt is 0.6% faster overall than V4+unroll8). We should avoid major regressions of course, so the question is whether we can tweak V4 a little so that it does better around 16KB without losing any of its performance gains. My results show that is possible with the blt workaround, but not with the unroll8 loop. > I'm not sure if I understood what the workaround code looks like, is it like this? It just injects a single blt at the top of the loop and changes the sub before the loop to subs, so you get something like this: subs count, count, tmp1 .p2align 4 1: b.lt last I can propose a patch for this workaround if it isn't clear. > I think your environment must be Applo 80 or FX700 which has 48 cores and 4 NUMA nodes. > FX1000 master node has 52 cores and FX1000 compute node has 50 cores. > OS sees FX1000 as if it has 8 NUMA nodes. I do see 4 NUMA nodes indeed, but performance isn't affected at all by which node you select (at least on bench-memset since it runs from L1/L2). Cheers, Wilco
Hi Wilco, > It's odd the behaviour with the same CPU isn't identical. If there is a way to make them > behave more similarly, I would love to hear it! In any case it would be good to know > how the blt workaround works on your system. You can see the difference between FX1000 and FX700 (==Apollo 80) [1]. The number of cores and clock are different at least. And the blt workaround worked for only FX700 but not for FX1000 as explained below. [1] https://www.fujitsu.com/global/products/computing/servers/supercomputer/specifications/ > > I'm not sure if I understood what the workaround code looks like, is it like this? > > It just injects a single blt at the top of the loop and changes the sub before the > loop to subs, so you get something like this: > > subs count, count, tmp1 > .p2align 4 > 1: b.lt last > > I can propose a patch for this workaround if it isn't clear. If you agree to the cmp and branch workaround (2 instructions at the beginning of the loop) below, I'll submit a patch. 1) Result of the blt workaround (1 instruction at the beginning of the loop) I tried two patterns, subs count, count, tmp1 .p2align 4 1: b.lt L(last) and sub count, count, tmp1 .p2align 4 1: cbnz xzr, L(last) Both patterns worked for only FX700, but not FX1000. 
FX700 master vs v4fix 1 instruction [2] FX700 v4 vs v4fix 1 instruction [3] FX1000 master vs v4fix 1 instruction [4] FX1000 v4 vs v4fix 1 instruction [5] [2] https://drive.google.com/file/d/1IBsPYg2ia2t1YyMmaYVb7tFG89njO2aq/view?usp=sharing [3] https://drive.google.com/file/d/1q44gqOWZvFhzKAe2di5y8EQrRwWkgxoU/view?usp=sharing [4] https://drive.google.com/file/d/1P10oD0-WO8J5t7QiP7QwgqOqlAZ2I5hn/view?usp=sharing [5] https://drive.google.com/file/d/1wKv-bPx20LgJyWl761gXiKLsPwhukEzx/view?usp=sharing 2) Result of the cmp and branch workaround (2 instructions at the beginning of the loop) I tried two patterns, sub count, count, tmp1 .p2align 4 1: subs tmp2, xzr, xzr b.lt 1b and sub count, count, tmp1 .p2align 4 1: cmp xzr, xzr b.ne 1b Both patterns worked for FX700 and FX1000. FX700 master vs v4fix 2 instructions [6] FX700 v4 vs v4fix 2 instructions [7] FX1000 master vs v4fix 2 instructions [8] FX1000 v4 vs v4fix 2 instructions [9] [6] https://drive.google.com/file/d/1B-CsRGT1rJFQCMHja78DEflQ-JHxSkGf/view?usp=sharing [7] https://drive.google.com/file/d/1KCriikc1jIKEKLFoaTV0jYTqhtvmbblh/view?usp=sharing [8] https://drive.google.com/file/d/1sunelmZ30jpd_aeWKXu65XNkS9X_akWb/view?usp=sharing [9] https://drive.google.com/file/d/1JaJG0I79VMSTGy2PqaZf1SILujE69Gi2/view?usp=sharing Thanks. Naohiro
Hi Naohiro, > You can see the difference between FX1000 and FX700 (==Apollo 80) [1]. > The number of cores and clock are different at least. > > And the blt workaround worked for only FX700 but not for FX1000 as explained below. > > [1] https://www.fujitsu.com/global/products/computing/servers/supercomputer/specifications/ So it looks like they are different silicon and likely slightly different microarchitectures which would explain the different behaviour. > If you agree to the cmp and branch workaround (2 instructions at the beginning of the loop) > below, I'll submit a patch. Yes, the 2 instruction workaround is clearly the best solution so far. It fixes the dips around 16KB but doesn't regress anything else. The results v4 vs v4fix [9] show there are even some uplifts in the 1-8KB range. [9] https://drive.google.com/file/d/1JaJG0I79VMSTGy2PqaZf1SILujE69Gi2/view?usp=sharing > 2) Result of the cmp and branch workaround (2 instructions at the beginning of the loop) It's interesting that this works on both systems; however, it's still a mystery why... It would be a good idea to ask your CPU team about this. Cheers, Wilco
Hi Wilco, > > If you agree to the cmp and branch workaround (2 instructions at the beginning of the loop) > > below, I'll submit a patch. > > Yes, the 2 instruction workaround is clearly the best solution so far. It fixes the dips > around 16KB but doesn't regress anything else. The results v4 vs v4fix [9] show there > are even some uplifts in the 1-8KB range. Thank you for the review. I submitted a patch [1]; please take a look. [1] https://sourceware.org/pipermail/libc-alpha/2021-August/130569.html > > 2) Result of the cmp and branch workaround (2 instructions at the beginning of the loop) > > It's interesting this works on both systems, however it's still a mystery why... > It would be a good idea to ask your CPU team about this. OK. In the meantime you can find the microarchitecture manual [2] if you're interested. [2] https://github.com/fujitsu/A64FX/blob/master/doc/A64FX_Microarchitecture_Manual_en_1.5.pdf Thanks. Naohiro
diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S index 8665c272431b46dadea53c63ab74829c3aa99312..36628e101db33a9a8ff5234b98dd5a3a5c9ed73c 100644 --- a/sysdeps/aarch64/multiarch/memset_a64fx.S +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S @@ -30,7 +30,6 @@ #define L2_SIZE (8*1024*1024) // L2 8MB - 1MB #define CACHE_LINE_SIZE 256 #define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1 -#define rest x2 #define vector_length x9 #if HAVE_AARCH64_SVE_ASM @@ -89,29 +88,19 @@ ENTRY (MEMSET) .p2align 4 L(vl_agnostic): // VL Agnostic - mov rest, count mov dst, dstin - add dstend, dstin, count - // if rest >= L2_SIZE && vector_length == 64 then L(L2) - mov tmp1, 64 - cmp rest, L2_SIZE - ccmp vector_length, tmp1, 0, cs - b.eq L(L2) - // if rest >= L1_SIZE && vector_length == 64 then L(L1_prefetch) - cmp rest, L1_SIZE - ccmp vector_length, tmp1, 0, cs - b.eq L(L1_prefetch) - + cmp count, L1_SIZE + b.hi L(L1_prefetch) + // count >= 8 * vector_length L(unroll8): - lsl tmp1, vector_length, 3 - .p2align 3 -1: cmp rest, tmp1 - b.cc L(last) - st1b_unroll + sub count, count, tmp1 + .p2align 4 +1: subs count, count, tmp1 + st1b_unroll 0, 7 add dst, dst, tmp1 - sub rest, rest, tmp1 - b 1b + b.hi 1b + add count, count, tmp1 L(last): cmp count, vector_length, lsl 1 @@ -129,18 +118,22 @@ L(last): st1b z0.b, p0, [dstend, -1, mul vl] ret -L(L1_prefetch): // if rest >= L1_SIZE + // count >= L1_SIZE .p2align 3 +L(L1_prefetch): + cmp count, L2_SIZE + b.hs L(L2) + cmp vector_length, 64 + b.ne L(unroll8) 1: st1b_unroll 0, 3 prfm pstl1keep, [dst, PF_DIST_L1] st1b_unroll 4, 7 prfm pstl1keep, [dst, PF_DIST_L1 + CACHE_LINE_SIZE] add dst, dst, CACHE_LINE_SIZE * 2 - sub rest, rest, CACHE_LINE_SIZE * 2 - cmp rest, L1_SIZE - b.ge 1b - cbnz rest, L(unroll8) - ret + sub count, count, CACHE_LINE_SIZE * 2 + cmp count, PF_DIST_L1 + b.hs 1b + b L(unroll8) // count >= L2_SIZE L(L2):