From patchwork Thu Jun 22 18:58:49 2017
X-Patchwork-Submitter: Siddhesh Poyarekar
X-Patchwork-Id: 21208
From: Siddhesh Poyarekar
To: libc-alpha@sourceware.org
Subject: [PATCH] aarch64: Optimized memcpy for Qualcomm Falkor processor
Date: Fri, 23 Jun 2017 00:28:49 +0530
Message-Id: <1498157929-23554-1-git-send-email-siddhesh@sourceware.org>

This is an optimized memcpy implementation for the Qualcomm Falkor
processor.  It improves SPECint in SPEC CPU2006 by 0.6%, with omnetpp and
xalancbmk leading at 6%, and the overall impact is mostly positive across
benchmarks.  In the glibc microbenchmarks the large copy sizes regress
slightly, but bench-memcpy-random improves throughout by about 5%.

	* sysdeps/aarch64/multiarch/Makefile (sysdep-routines): Add
	memcpy_falkor.
	* sysdeps/aarch64/multiarch/ifunc-impl-list.c (MAX_IFUNC): Bump.
	(__libc_ifunc_impl_list): Add __memcpy_falkor.
	* sysdeps/aarch64/multiarch/memcpy.c (libc_ifunc): Likewise.
	* sysdeps/unix/sysv/linux/aarch64/cpu-features.h (IS_FALKOR): New
	macro.
	* sysdeps/aarch64/multiarch/memcpy_falkor.S: New file.
---
 sysdeps/aarch64/multiarch/Makefile             |   2 +-
 sysdeps/aarch64/multiarch/ifunc-impl-list.c    |   3 +-
 sysdeps/aarch64/multiarch/memcpy.c             |   7 +-
 sysdeps/aarch64/multiarch/memcpy_falkor.S      | 294 +++++++++++++++++++++++++
 sysdeps/unix/sysv/linux/aarch64/cpu-features.h |   3 +
 5 files changed, 306 insertions(+), 3 deletions(-)
 create mode 100644 sysdeps/aarch64/multiarch/memcpy_falkor.S

diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 78d52c7..164ba1a 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,3 +1,3 @@
 ifeq ($(subdir),string)
-sysdep_routines += memcpy_generic memcpy_thunderx
+sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 32056bc..8e873b3 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -25,7 +25,7 @@
 #include <stdio.h>
 
 /* Maximum number of IFUNC implementations.  */
-#define MAX_IFUNC	2
+#define MAX_IFUNC	3
 
 size_t
 __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
@@ -40,6 +40,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c.  */
   IFUNC_IMPL (i, name, memcpy,
	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
   IFUNC_IMPL (i, name, memmove,
	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 9f73efb..b395df1 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -30,9 +30,14 @@ extern __typeof (__redirect_memcpy) __libc_memcpy;
 
 extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
 
 libc_ifunc (__libc_memcpy,
-            IS_THUNDERX (midr) ? __memcpy_thunderx : __memcpy_generic);
+            (IS_THUNDERX (midr)
+             ? __memcpy_thunderx
+             : (IS_FALKOR (midr)
+                ? __memcpy_falkor
+                : __memcpy_generic)));
 
 # undef memcpy
 strong_alias (__libc_memcpy, memcpy);
diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S
new file mode 100644
index 0000000..414adb4
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S
@@ -0,0 +1,294 @@
+/* Optimized memcpy for the Qualcomm Falkor processor.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define A_l	x6
+#define A_lw	w6
+#define A_h	x7
+#define A_hw	w7
+#define B_l	x8
+#define B_lw	w8
+#define B_h	x9
+#define C_l	x10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	src
+#define E_h	count
+#define F_l	srcend
+#define F_h	dst
+#define tmp1	x14
+#define res	x15
+
+#include <sysdep.h>
+
+/* Copy 64 bytes at a time and branch to LABEL on COND.  */
+.macro copy_line_and_branch cond, label
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.\cond	\label
+.endm
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+   medium copies of 17..96 bytes which are fully unrolled.  Large copies
+   of more than 96 bytes align the destination and use an unrolled loop
+   processing 64 bytes per iteration.
+   Small and medium copies read all data before writing, allowing any
+   kind of overlap, and memmove tailcalls memcpy for these cases as
+   well as non-overlapping copies.  */
+
+ENTRY_ALIGN (__memcpy_falkor, 6)
+
+	prfm	PLDL1KEEP, [src]
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 16
+	b.ls	L(copy16)
+	cmp	count, 96
+	b.hi	L(copy_long)
+
+	/* Medium copies: 17..96 bytes.  */
+	sub	tmp1, count, 1
+	ldp	A_l, A_h, [src]
+	tbnz	tmp1, 6, L(copy96)
+	ldp	D_l, D_h, [srcend, -16]
+	tbz	tmp1, 5, 1f
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+1:
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Small copies: 0..16 bytes.  */
+L(copy16):
+	cmp	count, 8
+	b.lo	1f
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+	.p2align 4
+1:
+	tbz	count, 2, 1f
+	ldr	A_lw, [src]
+	ldr	A_hw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	A_hw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
+1:
+	cbz	count, 2f
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	A_hw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	A_hw, [dstend, -1]
+2:	ret
+
+	.p2align 4
+	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
+	   32 bytes from the end.  */
+L(copy96):
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [src, 32]
+	ldp	D_l, D_h, [src, 48]
+	ldp	E_l, E_h, [srcend, -32]
+	ldp	F_l, F_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin, 32]
+	stp	D_l, D_h, [dstin, 48]
+	stp	E_l, E_h, [dstend, -32]
+	stp	F_l, F_h, [dstend, -16]
+	ret
+
+	/* Align DST to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.  There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.  The loop
+	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+	.p2align 4
+L(copy_long):
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	ldp	D_l, D_h, [src]
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	5f
+
+	/* Unroll the copy for 512 bytes.  We do this so that smaller copies
+	   don't get penalized by the extra checks we do for larger sizes
+	   further down.  */
+	copy_line_and_branch ls, 5f
+	copy_line_and_branch ls, 5f
+	copy_line_and_branch ls, 5f
+	copy_line_and_branch ls, 5f
+	copy_line_and_branch ls, 5f
+	copy_line_and_branch ls, 5f
+	copy_line_and_branch ls, 5f
+	copy_line_and_branch ls, 5f
+
+	/* If less than 2048 bytes remain, jump to the final loop and finish
+	   off the copy.  Otherwise, keep the last 2048 bytes for the final
+	   loop and try a couple of prefetching loops to optimize cache
+	   usage.  */
+	subs	count, count, 2048
+	b.hi	3f
+
+6:
+	/* 2048 bytes or less remaining, adjust COUNT and copy 64 bytes at a
+	   time.  */
+	add	count, count, 2048
+4:
+	copy_line_and_branch hi, 4b
+
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the end even if
+	   there is just 1 byte left.  */
+5:
+	ldp	E_l, E_h, [srcend, -64]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [srcend, -48]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [srcend, -16]
+	stp	D_l, D_h, [dst, 64]
+	stp	E_l, E_h, [dstend, -64]
+	stp	A_l, A_h, [dstend, -48]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
+	ret
+
+	/* Find an offset within the source that operates on a memory bank
+	   other than the one used by the source and destination pointers.  If
+	   we find one within the 1K to 4K range, then we can prefetch at two
+	   offsets to stream more data in from the source.  This computation
+	   formula is provided by Qualcomm.  */
+3:
+	sub	res, src, dst
+	sub	res, res, 1024
+	and	res, res, 0x7ff
+	subs	tmp1, count, res
+	b.hi	7f
+	/* We still have more than 2K bytes remaining, so copy 128 bytes at a
+	   time, prefetching at 2K-128 for every iteration until there is less
+	   than 2K left.  That way we make future data available in L1 and at
+	   the same time, limit our prefetch to within the source data.  */
+2:
+	prfm	PLDL1STRM, [src, 1920]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]
+	ldp	D_l, D_h, [src, 64]
+	stp	A_l, A_h, [dst, 80]
+	ldp	A_l, A_h, [src, 80]
+	stp	B_l, B_h, [dst, 96]
+	ldp	B_l, B_h, [src, 96]
+	stp	C_l, C_h, [dst, 112]
+	ldp	C_l, C_h, [src, 112]
+	stp	D_l, D_h, [dst, 128]!
+	ldp	D_l, D_h, [src, 128]!
+	subs	count, count, 128
+	b.hi	2b
+	b	6b
+
+7:
+	add	res, res, 2048
+
+	/* We found an appropriate offset.  Copy 128 bytes at a time,
+	   prefetching at 2K and the computed offset while the computed offset
+	   is within the source data.  */
+1:
+	prfm	PLDL1STRM, [src, 2048]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]
+	ldp	D_l, D_h, [src, 64]
+	prfm	PLDL1STRM, [src, res]
+	stp	A_l, A_h, [dst, 80]
+	ldp	A_l, A_h, [src, 80]
+	stp	B_l, B_h, [dst, 96]
+	ldp	B_l, B_h, [src, 96]
+	stp	C_l, C_h, [dst, 112]
+	ldp	C_l, C_h, [src, 112]
+	stp	D_l, D_h, [dst, 128]!
+	ldp	D_l, D_h, [src, 128]!
+	subs	tmp1, tmp1, 128
+	b.hi	1b
+
+	/* Update count once the loop is done.  Subtract the 2048 added to RES
+	   for the prefetch offset to account for the 2048 we deducted from
+	   COUNT earlier.  We will have copied:
+
+	   (COUNT - RES) - TMP1
+
+	   bytes so the remaining size is COUNT - (COUNT - RES - TMP1)
+
+	   i.e. RES + TMP1.  If COUNT is non-positive then we have 2048 bytes
+	   or less remaining and we jump forward to the end.  Otherwise we
+	   cascade into the loop below with a single prefetch before cascading
+	   further into the last loop.  */
+	sub	res, res, 2048
+	adds	count, res, tmp1
+	b.ls	6b
+	b	2b
+
+END (__memcpy_falkor)
+libc_hidden_builtin_def (__memcpy_falkor)
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index c92b650..73cb53d 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -41,6 +41,9 @@
 #define IS_THUNDERX(midr) (MIDR_IMPLEMENTOR(midr) == 'C'	\
 			   && MIDR_PARTNUM(midr) == 0x0a1)
 
+#define IS_FALKOR(midr) (MIDR_IMPLEMENTOR(midr) == 'Q'		\
+			 && MIDR_PARTNUM(midr) == 0xc00)
+
 struct cpu_features
 {
   uint64_t midr_el1;