From patchwork Fri Apr 15 12:44:15 2016 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Wilco Dijkstra X-Patchwork-Id: 11757 Received: (qmail 48870 invoked by alias); 15 Apr 2016 12:44:32 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 48855 invoked by uid 89); 15 Apr 2016 12:44:31 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-1.9 required=5.0 tests=BAYES_00, SPF_PASS autolearn=ham version=3.3.2 spammy=November, november, 7021 X-HELO: eu-smtp-delivery-143.mimecast.com From: Wilco Dijkstra To: 'GNU C Library' CC: nd , Richard Earnshaw , "Marcus Shawcroft" Subject: Re: [PATCH][AArch64] Tune memcpy Date: Fri, 15 Apr 2016 12:44:15 +0000 Message-ID: References: In-Reply-To: x-ms-office365-filtering-correlation-id: 98fb8e8a-d05a-4da7-ac2d-08d3652ba73f x-microsoft-exchange-diagnostics: 1; VI1PR08MB1165; 5:nD9Gl6V73eUO+b4Z85Szj+LOKcT+L2YSM7Bq7m/R51M9vAE2kGBrvQrUQMtBgzCQxbGYJ3QzWdUDzFoS28y0K7LhH5BH3Rm3F3o//Fnp6hdenYu+6QuVxE3dPv2jWSrE+w53oecBg42HrlPsFEKOm89xdMeXjExWCu2vMAwtCzNX7Y3Oq/TUBYRvYKkD7eSt; 24:eOo2BpmInaT5QwtacCwEAXfwi358O53edLSeNs4qXRnSBnFxwtZ08+VmQL1sTL+eIZLYhPupah/+ttPvYu6hFgkvDYtZLiKwylrRDGRGSMM=; 7:UUIQ6z5T4thyghx4v7MmnuA3b6tEKSiyQf+yXHYadvNTuPxpymcE+YvGD1fT3NBuT3HuVqZTWYNfN76X4P0ATDdUSyAzoZwf0OXn1haRSJ9ik8fOKjqSd/6Ndl+VzeK2SoUFOBruJUEV1jx4EFJ0P0sIACtXQoIspiR6VP31Ixr0RiDJR/YbJ4WEddMZgoFNGm7E36PWk4+FGz91E+Vd+4BLng1tLDgM8edKhCpSEvU=; 20:RmkCieobABpY+2QYmhdYXDd/jZdGYVwblD1rRtmrOcXVCz6cNlxra/6CcjsRcGB4pMDMnIkIb+2JdHPiuaJJFIf7t6D1+AxFByAsZ7YkvLwuQZkVNvLWSy22FT4LQ1Hro+saAmTgL7jsx8gZ9dzNsvtrdCNQYr9lU5DTTM6Ov5E= x-microsoft-antispam: UriScan:;BCL:0;PCL:0;RULEID:;SRVR:VI1PR08MB1165; nodisclaimer: True x-microsoft-antispam-prvs: x-exchange-antispam-report-test: UriScan:; x-exchange-antispam-report-cfa-test: BCL:0; PCL:0; RULEID:(9101521026)(601004)(2401047)(8121501046)(5005006)(3002001)(10201501046)(6055026); SRVR:VI1PR08MB1165; BCL:0; PCL:0; RULEID:; SRVR:VI1PR08MB1165; x-forefront-prvs: 0913EA1D60 x-forefront-antispam-report: SFV:NSPM; SFS:(10009020)(6009001)(13464003)(377424004)(4326007)(3660700001)(2906002)(9686002)(15975445007)(92566002)(2950100001)(3280700002)(76576001)(5003600100002)(1096002)(3900700001)(74316001)(2900100001)(33656002)(54356999)(5004730100002)(1220700001)(66066001)(110136002)(189998001)(76176999)(5002640100001)(19580405001)(19580395003)(86362001)(450100001)(3846002)(5250100002)(102836003)(87936001)(5008740100001)(11100500001)(50986999)(586003)(81166005)(6116002); DIR:OUT; SFP:1101; SCL:1; SRVR:VI1PR08MB1165; H:AM3PR08MB0088.eurprd08.prod.outlook.com; FPR:; SPF:None; MLV:sfv; LANG:en; spamdiagnosticoutput: 1:23 spamdiagnosticmetadata: NSPM MIME-Version: 1.0 X-OriginatorOrg: arm.com X-MS-Exchange-CrossTenant-originalarrivaltime: 15 Apr 2016 12:44:15.0526 (UTC) X-MS-Exchange-CrossTenant-fromentityheader: Hosted X-MS-Exchange-CrossTenant-id: f34e5979-57d9-4aaa-ad4d-b122a662184d X-MS-Exchange-Transport-CrossTenantHeadersStamped: VI1PR08MB1165 X-MC-Unique: e9p-KXg3QI6gIZMFOLhdKQ-1 ping diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S index 51e7268..6b8610e 100644 --- a/sysdeps/aarch64/memcpy.S +++ b/sysdeps/aarch64/memcpy.S @@ -35,6 +35,7 @@ #define A_h x7 #define A_hw w7 #define B_l x8 +#define B_lw w8 #define B_h x9 #define C_l x10 #define C_h x11 @@ -70,21 +71,40 @@ END (memmove) libc_hidden_builtin_def (memmove) ENTRY (memcpy) + prfm PLDL1KEEP, [src] add srcend, src, count add dstend, dstin, count + cmp count, 16 + b.ls L(copy16) cmp count, 96 b.hi L(copy_long) - cmp count, 16 - b.hs L(copy_medium) + /* Medium copies: 17..96 bytes. */ + sub tmp1, count, 1 + ldp A_l, A_h, [src] + tbnz tmp1, 6, L(copy96) + ldp D_l, D_h, [srcend, -16] + tbz tmp1, 5, 1f + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] +1: + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 /* Small copies: 0..16 bytes. */ L(copy16): - tbz count, 3, 1f + cmp count, 8 + b.lo 1f ldr A_l, [src] ldr A_h, [srcend, -8] str A_l, [dstin] str A_h, [dstend, -8] ret + .p2align 4 1: tbz count, 2, 1f ldr A_lw, [src] @@ -92,33 +112,21 @@ L(copy16): str A_lw, [dstin] str A_hw, [dstend, -4] ret - .p2align 4 + + /* Copy 0..3 bytes. Use a branchless sequence that copies the same + byte 3 times if count==1, or the 2nd byte twice if count==2. */ 1: cbz count, 2f + lsr tmp1, count, 1 ldrb A_lw, [src] - tbz count, 1, 1f - ldrh A_hw, [srcend, -2] - strh A_hw, [dstend, -2] -1: strb A_lw, [dstin] + ldrb A_hw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb A_hw, [dstend, -1] 2: ret .p2align 4 - /* Medium copies: 17..96 bytes. */ -L(copy_medium): - ldp A_l, A_h, [src] - tbnz count, 6, L(copy96) - ldp D_l, D_h, [srcend, -16] - tbz count, 5, 1f - ldp B_l, B_h, [src, 16] - ldp C_l, C_h, [srcend, -32] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstend, -32] -1: - stp A_l, A_h, [dstin] - stp D_l, D_h, [dstend, -16] - ret - - .p2align 4 /* Copy 64..96 bytes. Copy 64 bytes from the start and 32 bytes from the end. */ L(copy96):