From patchwork Thu Nov 19 12:34:54 2015
X-Patchwork-Submitter: Wilco Dijkstra
X-Patchwork-Id: 9738
From: "Wilco Dijkstra"
To: "'GNU C Library'"
Subject: [PATCH][AArch64] Tune memcpy
Date: Thu, 19 Nov 2015 12:34:54 -0000
Message-ID: <000801d122c6$b167bd90$143738b0$@arm.com>

This patch further tunes memcpy: avoid one branch for sizes 1-3, add a
prefetch, and improve small copies that are exact powers of 2.

OK for commit?

(depends on https://sourceware.org/ml/libc-alpha/2015-09/msg00633.html )

ChangeLog:
2015-11-19  Wilco Dijkstra

	* sysdeps/aarch64/memcpy.S (memcpy): Further tuning for performance.

---
 sysdeps/aarch64/memcpy.S | 56 ++++++++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 51e7268..6b8610e 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -35,6 +35,7 @@
 #define A_h	x7
 #define A_hw	w7
 #define B_l	x8
+#define B_lw	w8
 #define B_h	x9
 #define C_l	x10
 #define C_h	x11
@@ -70,21 +71,40 @@ END (memmove)
 libc_hidden_builtin_def (memmove)
 
 ENTRY (memcpy)
+	prfm	PLDL1KEEP, [src]
 	add	srcend, src, count
 	add	dstend, dstin, count
+	cmp	count, 16
+	b.ls	L(copy16)
 	cmp	count, 96
 	b.hi	L(copy_long)
-	cmp	count, 16
-	b.hs	L(copy_medium)
 
+	/* Medium copies: 17..96 bytes.  */
+	sub	tmp1, count, 1
+	ldp	A_l, A_h, [src]
+	tbnz	tmp1, 6, L(copy96)
+	ldp	D_l, D_h, [srcend, -16]
+	tbz	tmp1, 5, 1f
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+1:
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
 	/* Small copies: 0..16 bytes.  */
 L(copy16):
-	tbz	count, 3, 1f
+	cmp	count, 8
+	b.lo	1f
 	ldr	A_l, [src]
 	ldr	A_h, [srcend, -8]
 	str	A_l, [dstin]
 	str	A_h, [dstend, -8]
 	ret
+	.p2align 4
 1:
 	tbz	count, 2, 1f
 	ldr	A_lw, [src]
@@ -92,33 +112,21 @@ L(copy16):
 	str	A_lw, [dstin]
 	str	A_hw, [dstend, -4]
 	ret
-	.p2align 4
+
+	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
 1:
 	cbz	count, 2f
+	lsr	tmp1, count, 1
 	ldrb	A_lw, [src]
-	tbz	count, 1, 1f
-	ldrh	A_hw, [srcend, -2]
-	strh	A_hw, [dstend, -2]
-1:	strb	A_lw, [dstin]
+	ldrb	A_hw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	A_hw, [dstend, -1]
 2:	ret
 
 	.p2align 4
-	/* Medium copies: 17..96 bytes.  */
-L(copy_medium):
-	ldp	A_l, A_h, [src]
-	tbnz	count, 6, L(copy96)
-	ldp	D_l, D_h, [srcend, -16]
-	tbz	count, 5, 1f
-	ldp	B_l, B_h, [src, 16]
-	ldp	C_l, C_h, [srcend, -32]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstend, -32]
-1:
-	stp	A_l, A_h, [dstin]
-	stp	D_l, D_h, [dstend, -16]
-	ret
-
-	.p2align 4
 	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
 	   32 bytes from the end.  */
 L(copy96):
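
[Editorial note, not part of the patch: for readers who do not follow AArch64
assembly, below is a rough C sketch of the two ideas used above: the branchless
0..3-byte tail and the overlapping start/end copy behind the medium-size path.
The helper names and the small test in main are illustrative assumptions only,
and both helpers assume non-overlapping src/dst as memcpy requires.]

#include <stddef.h>
#include <string.h>
#include <stdio.h>

/* Illustrative helper (not in the patch): copy 1..3 bytes without
   branching on the exact count.  count==1 writes the same byte three
   times, count==2 writes the second byte twice, count==3 writes each
   byte once, matching the comment in the new 0..3-byte tail.  */
static void
copy_1_to_3 (unsigned char *dst, const unsigned char *src, size_t count)
{
  size_t mid = count >> 1;	/* 0 if count==1, 1 if count is 2 or 3.  */
  unsigned char first = src[0];
  unsigned char middle = src[mid];
  unsigned char last = src[count - 1];
  dst[0] = first;
  dst[mid] = middle;
  dst[count - 1] = last;
}

/* Illustrative helper (not in the patch): the overlapping start/end
   trick used for medium copies.  For 17..32 bytes, one 16-byte block
   from the start and one from the end cover the whole range; the two
   blocks may overlap in the middle, which is harmless.  */
static void
copy_17_to_32 (unsigned char *dst, const unsigned char *src, size_t count)
{
  memcpy (dst, src, 16);
  memcpy (dst + count - 16, src + count - 16, 16);
}

int
main (void)
{
  unsigned char in[32], out[32] = { 0 };
  for (size_t i = 0; i < sizeof in; i++)
    in[i] = (unsigned char) i;

  copy_1_to_3 (out, in, 2);
  copy_17_to_32 (out, in, 20);
  printf ("%d %d %d\n", out[0], out[1], out[19]);	/* prints: 0 1 19 */
  return 0;
}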