From patchwork Thu May 19 11:32:47 2016 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Wilco Dijkstra X-Patchwork-Id: 12376 Received: (qmail 96905 invoked by alias); 19 May 2016 11:33:16 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 96887 invoked by uid 89); 19 May 2016 11:33:15 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-1.9 required=5.0 tests=BAYES_00, SPF_PASS autolearn=ham version=3.3.2 spammy=WilcoDijkstraarmcom, wilcodijkstraarmcom, wilco.dijkstra@arm.com, Wilco.Dijkstra@arm.com X-HELO: eu-smtp-delivery-143.mimecast.com From: Wilco Dijkstra To: 'GNU C Library' , Marcus Shawcroft CC: nd Subject: Re: [PATCH][AArch64] Tune memcpy Date: Thu, 19 May 2016 11:32:47 +0000 Message-ID: References: , In-Reply-To: x-ms-office365-filtering-correlation-id: 87fe46d3-4ea6-4003-2b6f-08d37fd94dc7 x-microsoft-exchange-diagnostics: 1; HE1PR08MB1164; 5:1Agdb83sCqVuGHEMsZ3KfzIl+1fdqSQ8ENjCyX5rwIEhflRdciahEaXF5xsaWxx4QTkO4SCJ5t9HK102bjHFsbyLingu5kkLauJGp+numJ5gqSf4aEn76jZxWG55AgCKJqKyckicMkqQS5T39hAcIw==; 24:phjH1SpobVjd+kFt3FkKRNK4nGfDM5e6Tlg4kK3O3jNZN9j93pYefkdnclrVBKDPt4cC2qrb5Dc655TeCGmrpGa3du+MX6xUzwD/7NYEO98=; 7:pv3AdMTI3nwlCytA1WR4kvbI1fWk9NZfho1XbroJrpa/RzjFfLcd+wXmJbNcBf30VN5sff4xLwzBWU/zk7gyvDTh0rr85GokgsIj9TlHXJ0Osp+Thnn4FhmT9c02HFzwmjCtnB4wBeBCYLSA53NTxdIlkRSV02t5nBbQI6q4r5oPCipsMg3O1dWaM33ep5Iz; 20:v/rWjkW1EcGuLL1sGsGrM63qYheceiLgtDvdVtPgmJulUu4HIFFGHfvtj6sOJQxCZXEqaubmwPULDJc/C08kRhIjTvtIxIx9sWy9mf6owkRHbw9CD6MsTHryW27hTaC9MrJf5b1CCEtfAQb0acAHCp7IjSeg7cOu04sn0kwtxKQ= x-microsoft-antispam: UriScan:;BCL:0;PCL:0;RULEID:;SRVR:HE1PR08MB1164; nodisclaimer: True x-microsoft-antispam-prvs: x-exchange-antispam-report-test: UriScan:; x-exchange-antispam-report-cfa-test: BCL:0; PCL:0; RULEID:(601004)(2401047)(8121501046)(5005006)(10201501046)(3002001)(6055026); SRVR:HE1PR08MB1164; BCL:0; PCL:0; RULEID:; SRVR:HE1PR08MB1164; x-forefront-prvs: 094700CA91 x-forefront-antispam-report: SFV:NSPM; SFS:(10009020)(6009001)(377424004)(13464003)(3900700001)(19580395003)(3280700002)(3660700001)(9686002)(76176999)(87936001)(1220700001)(54356999)(50986999)(2906002)(450100001)(76576001)(66066001)(4326007)(8936002)(189998001)(586003)(3846002)(6116002)(5004730100002)(81166006)(102836003)(8676002)(5003600100002)(5008740100001)(5001770100001)(5250100002)(74316001)(5002640100001)(19580405001)(11100500001)(2950100001)(2900100001)(33656002)(86362001)(92566002)(357404004); DIR:OUT; SFP:1101; SCL:1; SRVR:HE1PR08MB1164; H:AM3PR08MB0088.eurprd08.prod.outlook.com; FPR:; SPF:None; MLV:sfv; LANG:en; spamdiagnosticoutput: 1:23 spamdiagnosticmetadata: NSPM MIME-Version: 1.0 X-OriginatorOrg: arm.com X-MS-Exchange-CrossTenant-originalarrivaltime: 19 May 2016 11:32:47.6713 (UTC) X-MS-Exchange-CrossTenant-fromentityheader: Hosted X-MS-Exchange-CrossTenant-id: f34e5979-57d9-4aaa-ad4d-b122a662184d X-MS-Exchange-Transport-CrossTenantHeadersStamped: HE1PR08MB1164 X-MC-Unique: akVScUm7QQSSK3cgtBoW3Q-1 ping diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S index 51e7268..6b8610e 100644 --- a/sysdeps/aarch64/memcpy.S +++ b/sysdeps/aarch64/memcpy.S @@ -35,6 +35,7 @@ #define A_h x7 #define A_hw w7 #define B_l x8 +#define B_lw w8 #define B_h x9 #define C_l x10 #define C_h x11 @@ -70,21 +71,40 @@ END (memmove) libc_hidden_builtin_def (memmove) ENTRY (memcpy) + prfm PLDL1KEEP, [src] add srcend, src, count add dstend, dstin, count + cmp count, 16 + b.ls L(copy16) cmp count, 96 b.hi L(copy_long) - cmp count, 16 - b.hs L(copy_medium) + /* Medium copies: 17..96 bytes. */ + sub tmp1, count, 1 + ldp A_l, A_h, [src] + tbnz tmp1, 6, L(copy96) + ldp D_l, D_h, [srcend, -16] + tbz tmp1, 5, 1f + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] +1: + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 /* Small copies: 0..16 bytes. */ L(copy16): - tbz count, 3, 1f + cmp count, 8 + b.lo 1f ldr A_l, [src] ldr A_h, [srcend, -8] str A_l, [dstin] str A_h, [dstend, -8] ret + .p2align 4 1: tbz count, 2, 1f ldr A_lw, [src] @@ -92,33 +112,21 @@ L(copy16): str A_lw, [dstin] str A_hw, [dstend, -4] ret - .p2align 4 + + /* Copy 0..3 bytes. Use a branchless sequence that copies the same + byte 3 times if count==1, or the 2nd byte twice if count==2. */ 1: cbz count, 2f + lsr tmp1, count, 1 ldrb A_lw, [src] - tbz count, 1, 1f - ldrh A_hw, [srcend, -2] - strh A_hw, [dstend, -2] -1: strb A_lw, [dstin] + ldrb A_hw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb A_hw, [dstend, -1] 2: ret .p2align 4 - /* Medium copies: 17..96 bytes. */ -L(copy_medium): - ldp A_l, A_h, [src] - tbnz count, 6, L(copy96) - ldp D_l, D_h, [srcend, -16] - tbz count, 5, 1f - ldp B_l, B_h, [src, 16] - ldp C_l, C_h, [srcend, -32] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstend, -32] -1: - stp A_l, A_h, [dstin] - stp D_l, D_h, [dstend, -16] - ret - - .p2align 4 /* Copy 64..96 bytes. Copy 64 bytes from the start and 32 bytes from the end. */ L(copy96):