From patchwork Thu May 3 17:52:08 2018
From: Siddhesh Poyarekar
To: libc-alpha@sourceware.org
Subject: [PATCH 1/2] aarch64, falkor: Ignore prefetcher hints for memmove tail
Date: Thu, 3 May 2018 23:22:08 +0530
Message-Id: <20180503175209.2943-2-siddhesh@sourceware.org>
In-Reply-To: <20180503175209.2943-1-siddhesh@sourceware.org>
References: <20180503175209.2943-1-siddhesh@sourceware.org>

The tails of the copy loops are unable to train the falkor hardware
prefetcher because they load from a different base register than the
hot loop.  In this case, avoid serializing the instructions by loading
the data into different registers.  Also peel the last iteration of the
loop into the tail (and have it use different registers), since this
gives better performance for medium sizes.

This results in performance improvements of between 3% and 20% over the
current falkor implementation for sizes between 128 bytes and 1K on the
memmove-walk benchmark, thus mostly covering the regressions seen
against the generic memmove.

	* sysdeps/aarch64/multiarch/memmove_falkor.S (__memmove_falkor):
	Use multiple registers to move data in loop tail.
---
 sysdeps/aarch64/multiarch/memmove_falkor.S | 48 ++++++++++++++++++------------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/sysdeps/aarch64/multiarch/memmove_falkor.S b/sysdeps/aarch64/multiarch/memmove_falkor.S
index 3375adf2de..c0d9560301 100644
--- a/sysdeps/aarch64/multiarch/memmove_falkor.S
+++ b/sysdeps/aarch64/multiarch/memmove_falkor.S
@@ -150,7 +150,6 @@ L(copy96):
 
 	.p2align 4
 L(copy_long):
-	sub	count, count, 64 + 16	/* Test and readjust count.  */
 	mov	B_l, Q_l
 	mov	B_h, Q_h
 	ldp	A_l, A_h, [src]
@@ -161,6 +160,8 @@ L(copy_long):
 	ldp	Q_l, Q_h, [src, 16]!
 	stp	A_l, A_h, [dstin]
 	ldp	A_l, A_h, [src, 16]!
+	subs	count, count, 32 + 64 + 16	/* Test and readjust count.  */
+	b.ls	L(last64)
 
 L(loop64):
 	subs	count, count, 32
@@ -170,18 +171,22 @@ L(loop64):
 	ldp	A_l, A_h, [src, 16]!
 	b.hi	L(loop64)
 
-	/* Write the last full set of 32 bytes.  The remainder is at most 32
-	   bytes, so it is safe to always copy 32 bytes from the end even if
-	   there is just 1 byte left.  */
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes and at least 33 bytes, so it is safe to always copy 64 bytes
+	   from the end.  */
 L(last64):
-	ldp	C_l, C_h, [srcend, -32]
+	ldp	C_l, C_h, [srcend, -64]
 	stp	Q_l, Q_h, [dst, 16]
-	ldp	Q_l, Q_h, [srcend, -16]
+	mov	Q_l, B_l
+	mov	Q_h, B_h
+	ldp	B_l, B_h, [srcend, -48]
 	stp	A_l, A_h, [dst, 32]
-	stp	C_l, C_h, [dstend, -32]
-	stp	Q_l, Q_h, [dstend, -16]
-	mov	Q_l, B_l
-	mov	Q_h, B_h
+	ldp	A_l, A_h, [srcend, -32]
+	ldp	D_l, D_h, [srcend, -16]
+	stp	C_l, C_h, [dstend, -64]
+	stp	B_l, B_h, [dstend, -48]
+	stp	A_l, A_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
 	ret
 
 	.p2align 4
@@ -204,7 +209,8 @@ L(move_long):
 	sub	count, count, tmp1
 	ldp	A_l, A_h, [srcend, -16]!
 	sub	dstend, dstend, tmp1
-	sub	count, count, 64
+	subs	count, count, 32 + 64
+	b.ls	2f
 
 1:
 	subs	count, count, 32
@@ -214,18 +220,22 @@ L(move_long):
 	ldp	A_l, A_h, [srcend, -16]!
 	b.hi	1b
 
-	/* Write the last full set of 32 bytes.  The remainder is at most 32
-	   bytes, so it is safe to always copy 32 bytes from the start even if
-	   there is just 1 byte left.  */
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes and at least 33 bytes, so it is safe to always copy 64 bytes
+	   from the start.  */
 2:
-	ldp	C_l, C_h, [src, 16]
+	ldp	C_l, C_h, [src, 48]
 	stp	Q_l, Q_h, [dstend, -16]
-	ldp	Q_l, Q_h, [src]
-	stp	A_l, A_h, [dstend, -32]
-	stp	C_l, C_h, [dstin, 16]
-	stp	Q_l, Q_h, [dstin]
 	mov	Q_l, B_l
 	mov	Q_h, B_h
+	ldp	B_l, B_h, [src, 32]
+	stp	A_l, A_h, [dstend, -32]
+	ldp	A_l, A_h, [src, 16]
+	ldp	D_l, D_h, [src]
+	stp	C_l, C_h, [dstin, 48]
+	stp	B_l, B_h, [dstin, 32]
+	stp	A_l, A_h, [dstin, 16]
+	stp	D_l, D_h, [dstin]
 3:	ret
 
 END (__memmove_falkor)
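
For reference, a rough C model of what the rewritten forward-copy tail
(L(last64)) does; the helper names copy_tail64/load16/store16 and the
use of memcpy for the 16-byte accesses are illustrative assumptions and
not part of the patch:

#include <stdint.h>
#include <string.h>

/* One 16-byte chunk, standing in for an ldp/stp register pair.  */
typedef struct { uint64_t lo, hi; } pair16;

static inline pair16
load16 (const unsigned char *p)
{
  pair16 v;
  memcpy (&v, p, 16);		/* models ldp */
  return v;
}

static inline void
store16 (unsigned char *p, pair16 v)
{
  memcpy (p, &v, 16);		/* models stp */
}

/* The main loop has left between 33 and 64 bytes uncopied, so copy a
   full 64 bytes addressed from the end of both buffers.  Four separate
   temporaries keep the loads independent of one another and of the
   registers still live in the loop, and the stores may harmlessly
   overlap bytes the loop has already written.  */
static void
copy_tail64 (unsigned char *dstend, const unsigned char *srcend)
{
  pair16 c = load16 (srcend - 64);
  pair16 b = load16 (srcend - 48);
  pair16 a = load16 (srcend - 32);
  pair16 d = load16 (srcend - 16);

  store16 (dstend - 64, c);
  store16 (dstend - 48, b);
  store16 (dstend - 32, a);
  store16 (dstend - 16, d);
}

The backward-copy tail (label 2 in L(move_long)) is the mirror image:
it copies 64 bytes addressed from the start of the buffers into
dstin..dstin+64 in the same register-independent fashion.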