From patchwork Fri Sep 28 18:08:46 2018
X-Patchwork-Submitter: Anton Youdkevitch
X-Patchwork-Id: 29574
From: Anton Youdkevitch
Subject: [PATCH] aarch64: optimized memcpy implementation for thunderx2
To: libc-alpha@sourceware.org
Message-ID:
Date: Fri, 28 Sep 2018 21:08:46 +0300

Optimized memcpy implementation that uses the "ext" instruction.  The
speedup is up to 30% for larger lengths compared to the existing
thunderx2 implementation.  The performance comparison was done with
the library's standard benchmarks.  The STREAM benchmark improved by
~3% compared to a build with the '-fno-builtin' flag.

OK?

diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
index de494d9..6000365 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
@@ -74,13 +74,10 @@
 #if IS_IN (libc)
-# ifndef USE_THUNDERX2
 # undef MEMCPY
 # define MEMCPY __memcpy_thunderx
 # undef MEMMOVE
 # define MEMMOVE __memmove_thunderx
-# define USE_THUNDERX
-# endif
 ENTRY_ALIGN (MEMMOVE, 6)
@@ -182,8 +179,6 @@ L(copy96):
 	.p2align 4
 L(copy_long):
-# if defined(USE_THUNDERX) || defined (USE_THUNDERX2)
-
 	/* On thunderx, large memcpy's are helped by software prefetching.
 	   This loop is identical to the one below it but with prefetching
 	   instructions included.  For loops that are less than 32768 bytes,
@@ -196,11 +191,7 @@ L(copy_long):
 	bic	dst, dstin, 15
 	ldp	D_l, D_h, [src]
 	sub	src, src, tmp1
-# if defined(USE_THUNDERX)
 	prfm	pldl1strm, [src, 384]
-# elif defined(USE_THUNDERX2)
-	prfm	pldl1strm, [src, 256]
-# endif
 	add	count, count, tmp1	/* Count is now 16 too large.  */
 	ldp	A_l, A_h, [src, 16]
 	stp	D_l, D_h, [dstin]
@@ -210,13 +201,9 @@
 	subs	count, count, 128 + 16	/* Test and readjust count.
*/ L(prefetch_loop64): -# if defined(USE_THUNDERX) tbz src, #6, 1f prfm pldl1strm, [src, 512] 1: -# elif defined(USE_THUNDERX2) - prfm pldl1strm, [src, 256] -# endif stp A_l, A_h, [dst, 16] ldp A_l, A_h, [src, 16] stp B_l, B_h, [dst, 32] @@ -230,7 +217,6 @@ L(prefetch_loop64): b L(last64) L(copy_long_without_prefetch): -# endif and tmp1, dstin, 15 bic dst, dstin, 15 diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S index 8501abf..dbca92b 100644 --- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S +++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S @@ -20,8 +20,980 @@ /* The actual code in this memcpy and memmove is in memcpy_thunderx.S. The only real differences are with the prefetching instructions. */ +#include + +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses. + * + */ + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define tmp2 x6 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define A_hw w7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l src +#define E_h count +#define F_l srcend +#define F_h dst +#define G_l count +#define G_h dst +#define tmp1 x14 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 +#define H_q q7 +#define I_q q16 +#define J_q q17 + +#define A_v v0 +#define B_v v1 +#define C_v v2 +#define D_v v3 +#define E_v v4 +#define F_v v5 +#define G_v v6 +#define H_v v7 +#define I_v v16 +#define J_v v17 + +#ifndef MEMMOVE +# define MEMMOVE memmove +#endif +#ifndef MEMCPY +# define MEMCPY memcpy +#endif + +#if IS_IN (libc) + +#undef MEMCPY +#undef MEMMOVE #define MEMCPY __memcpy_thunderx2 #define MEMMOVE __memmove_thunderx2 -#define USE_THUNDERX2 -#include "memcpy_thunderx.S" + +/* Copies are split into 3 main cases: small copies of up to 16 bytes, + medium copies of 17..96 bytes which are fully unrolled. Large copies + of more than 96 bytes align the destination and use an unrolled loop + processing 64 bytes per iteration. + The current optimized memcpy implementation is not compatible with + memmove and is separated from it completely. See below. + Overlapping large forward memmoves use a loop that copies backwards. +*/ + +ENTRY_ALIGN (MEMMOVE, 6) + + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) + + sub tmp1, dstin, src + cmp count, 96 + ccmp tmp1, count, 2, hi + b.lo L(move_long) + + prfm PLDL1KEEP, [src] + add srcend, src, count + add dstend, dstin, count + cmp count, 16 + b.ls L(copy16) + cmp count, 96 + b.hi L(copy_long) + + /* Medium copies: 17..96 bytes. */ + sub tmp1, count, 1 + ldp A_l, A_h, [src] + tbnz tmp1, 6, L(copy96) + ldp D_l, D_h, [srcend, -16] + tbz tmp1, 5, 1f + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] +1: + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Small copies: 0..16 bytes. */ +L(copy16): + cmp count, 8 + b.lo 1f + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret + .p2align 4 +1: + tbz count, 2, 1f + ldr A_lw, [src] + ldr A_hw, [srcend, -4] + str A_lw, [dstin] + str A_hw, [dstend, -4] + ret + + /* Copy 0..3 bytes. Use a branchless sequence that copies the same + byte 3 times if count==1, or the 2nd byte twice if count==2. 
*/ +1: + cbz count, 2f + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb A_hw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb A_hw, [dstend, -1] +2: ret + + .p2align 4 + /* Copy 64..96 bytes. Copy 64 bytes from the start and + 32 bytes from the end. */ +L(copy96): + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [src, 32] + ldp D_l, D_h, [src, 48] + ldp E_l, E_h, [srcend, -32] + ldp F_l, F_h, [srcend, -16] + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin, 32] + stp D_l, D_h, [dstin, 48] + stp E_l, E_h, [dstend, -32] + stp F_l, F_h, [dstend, -16] + ret + + /* Align DST to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 64 bytes per iteration and prefetches one iteration ahead. */ + + .p2align 4 +L(copy_long): + and tmp1, dstin, 15 + bic dst, dstin, 15 + ldp D_l, D_h, [src] + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(last64) +L(loop64): + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(loop64) + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the end even if + there is just 1 byte left. */ +L(last64): + ldp E_l, E_h, [srcend, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend, -64] + stp A_l, A_h, [dstend, -48] + stp B_l, B_h, [dstend, -32] + stp C_l, C_h, [dstend, -16] + ret + + .p2align 4 +L(move_long): + cbz tmp1, 3f + + add srcend, src, count + add dstend, dstin, count + + /* Align dstend to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 64 bytes per iteration and prefetches one iteration ahead. */ + + and tmp1, dstend, 15 + ldp D_l, D_h, [srcend, -16] + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls 2f + + nop +1: + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! + subs count, count, 64 + b.hi 1b + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the start even if + there is just 1 byte left. 
*/ +2: + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] +3: ret + +END (MEMMOVE) +libc_hidden_builtin_def (MEMMOVE) + + +/* memcpy implementation below is not compatible with memmove + because of pipelined loads/stores, which are faster, but they + can't be used in the case of overlapping memmove arrays */ + +#define MEMCPY_PREFETCH_LDR 640 + +ENTRY (MEMCPY) + DELOUSE (0) + DELOUSE (1) + DELOUSE (2) + + add srcend, src, count + cmp count, 16 + b.ls L(memcopy16) + ldr A_q, [src], #16 + add dstend, dstin, count + and tmp1, src, 15 + cmp count, 96 + b.hi L(memcopy_long) + + /* Medium copies: 17..96 bytes. */ + ldr E_q, [srcend, -16] + cmp count, 64 + b.gt L(memcpy_copy96) + cmp count, 48 + b.le L(bytes_17_to_48) + /* 49..64 bytes */ + ldp B_q, C_q, [src] + str E_q, [dstend, -16] + stp A_q, B_q, [dstin] + str C_q, [dstin, 32] + ret + +L(bytes_17_to_48): + /* 17..48 bytes*/ + cmp count, 32 + b.gt L(bytes_32_to_48) + /* 17..32 bytes*/ + str A_q, [dstin] + str E_q, [dstend, -16] + ret + +L(bytes_32_to_48): + /* 32..48 */ + ldr B_q, [src] + str A_q, [dstin] + str E_q, [dstend, -16] + str B_q, [dstin, 16] + ret + + .p2align 4 + /* Small copies: 0..16 bytes. */ +L(memcopy16): + cmp count, 8 + b.lo L(bytes_0_to_8) + ldr A_l, [src] + ldr A_h, [srcend, -8] + add dstend, dstin, count + str A_l, [dstin] + str A_h, [dstend, -8] + ret + .p2align 4 + +L(bytes_0_to_8): + tbz count, 2, L(bytes_0_to_3) + ldr A_lw, [src] + ldr A_hw, [srcend, -4] + add dstend, dstin, count + str A_lw, [dstin] + str A_hw, [dstend, -4] + ret + + /* Copy 0..3 bytes. Use a branchless sequence that copies the same + byte 3 times if count==1, or the 2nd byte twice if count==2. */ +L(bytes_0_to_3): + cbz count, L(end) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb A_hw, [srcend, -1] + add dstend, dstin, count + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb A_hw, [dstend, -1] +L(end): ret + + .p2align 4 + +L(memcpy_copy96): + /* Copying 65..96 bytes. A_q (first 16 bytes) and + E_q(last 16 bytes) are already loaded. + + The size is large enough to benefit from aligned + loads */ + bic src, src, 15 + ldp B_q, C_q, [src] + str A_q, [dstin] + /* Loaded 64 bytes, second 16-bytes chunk can be + overlapping with the first chunk by tmp1 bytes. + Stored 16 bytes. */ + sub dst, dstin, tmp1 + add count, count, tmp1 + /* the range of count being [65..96] becomes [65..111] + after tmp [0..15] gets added to it, + count now is +48 */ + cmp count, 80 + b.gt L(copy96_medium) + ldr D_q, [src, 32] + stp B_q, C_q, [dst, 16] + str E_q, [dstend, -16] + str D_q, [dst, 48] + ret + + .p2align 4 +L(copy96_medium): + ldp D_q, A_q, [src, 32] + str B_q, [dst, 16] + cmp count, 96 + b.gt L(copy96_large) + str E_q, [dstend, -16] + stp C_q, D_q, [dst, 32] + str A_q, [dst, 64] + ret + +L(copy96_large): + ldr F_q, [src, 64] + stp C_q, D_q, [dst, 32] + str E_q, [dstend, -16] + stp A_q, F_q, [dst, 64] + ret + + .p2align 4 +L(memcopy_long): + bic src, src, 15 + ldp B_q, C_q, [src], #32 + str A_q, [dstin] + sub dst, dstin, tmp1 + add count, count, tmp1 + add dst, dst, 16 + and tmp1, dst, 15 + ldp D_q, E_q, [src], #32 + str B_q, [dst], #16 + + /* Already loaded 64+16 bytes. 
Check if at
+	   least 64 more bytes left */
+	subs	count, count, 64+64+16
+	b.lt	L(loop128_exit2)
+	cmp	count, MEMCPY_PREFETCH_LDR + 64 + 32
+	b.lt	L(loop128)
+	cbnz	tmp1, L(dst_unaligned)
+	sub	count, count, MEMCPY_PREFETCH_LDR + 64 + 32
+
+	.p2align 4
+
+L(loop128_prefetch):
+	str	C_q, [dst], #16
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	str	D_q, [dst], #16
+	ldp	F_q, G_q, [src], #32
+	str	E_q, [dst], #16
+	ldp	H_q, A_q, [src], #32
+	str	F_q, [dst], #16
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	str	G_q, [dst], #16
+	ldp	B_q, C_q, [src], #32
+	str	H_q, [dst], #16
+	ldp	D_q, E_q, [src], #32
+	stp	A_q, B_q, [dst], #32
+	subs	count, count, 128
+	b.ge	L(loop128_prefetch)
+
+L(preloop128):
+	add	count, count, MEMCPY_PREFETCH_LDR + 64 + 32
+	.p2align 4
+L(loop128):
+	ldp	F_q, G_q, [src], #32
+	str	C_q, [dst], #16
+	ldp	B_q, A_q, [src], #32
+	str	D_q, [dst], #16
+	stp	E_q, F_q, [dst], #32
+	stp	G_q, B_q, [dst], #32
+	subs	count, count, 64
+	b.lt	L(loop128_exit1)
+L(loop128_proceed):
+	ldp	B_q, C_q, [src], #32
+	str	A_q, [dst], #16
+	ldp	D_q, E_q, [src], #32
+	str	B_q, [dst], #16
+	subs	count, count, 64
+	b.ge	L(loop128)
+
+	.p2align 4
+L(loop128_exit2):
+	stp	C_q, D_q, [dst], #32
+	str	E_q, [dst], #16
+	b	L(copy_long_check32);
+
+L(loop128_exit1):
+	/* A_q is still not stored and there are 0..63 bytes left,
+	   so count is -64..-1.
+	   Check if fewer than 32 bytes are left (count < -32).  */
+	str	A_q, [dst], #16
+L(copy_long_check32):
+	cmn	count, 64
+	b.eq	L(copy_long_done)
+	cmn	count, 32
+	b.le	L(copy_long_last32)
+	ldp	B_q, C_q, [src]
+	stp	B_q, C_q, [dst]
+
+L(copy_long_last32):
+	ldp	F_q, G_q, [srcend, -32]
+	stp	F_q, G_q, [dstend, -32]
+
+L(copy_long_done):
+	ret
+
+L(dst_unaligned):
+	/* For the unaligned store case the code loads two
+	   aligned chunks and then merges them using the ext
+	   instruction.  This can be up to 30% faster than
+	   a simple unaligned store access.
+
+	   Current state: tmp1 = dst % 16; C_q, D_q, E_q
+	   contain data yet to be stored; src and dst point
+	   to the next data to be processed; A_q, B_q contain
+	   data already stored; count = bytes left to be
+	   loaded, decremented by 64.
+
+	   Control is passed here if at least 64 bytes are left
+	   to be loaded.  The code does two aligned loads and then
+	   extracts (16-tmp1) bytes from the first register and
+	   tmp1 bytes from the next register, forming the value
+	   for the aligned store.
+
+	   As the ext instruction can only have its index encoded
+	   as an immediate, 15 cases process each possible index
+	   value.  A computed goto is used to reach the required
+	   code.
*/ + + /* Store the 16 bytes to dst and align dst for further + operations, several bytes will be stored at this + address once more */ + str C_q, [dst], #16 + ldp F_q, G_q, [src], #32 + bic dst, dst, 15 + adr tmp2, L(load_and_merge) + add tmp2, tmp2, tmp1, LSL 7 + sub tmp2, tmp2, 128 + br tmp2 + +.p2align 7 +L(load_and_merge): +#define EXT_SIZE 1 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.lt 2f +1: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 1b +2: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 2 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.lt 2f +1: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 1b +2: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 3 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.lt 2f +1: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 1b +2: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 4 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.lt 2f +1: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 1b +2: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +#undef EXT_SIZE + 
+.p2align 7 +#define EXT_SIZE 5 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.lt 2f +1: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 1b +2: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 6 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.lt 2f +1: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 1b +2: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 7 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.lt 2f +1: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 1b +2: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 8 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.lt 2f +1: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 1b +2: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 9 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.lt 2f +1: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, 
F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 1b +2: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 10 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.lt 2f +1: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 1b +2: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 11 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.lt 2f +1: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 1b +2: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 12 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.lt 2f +1: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 1b +2: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 13 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.lt 2f +1: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 1b +2: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 
16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 14 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.lt 2f +1: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 1b +2: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +#undef EXT_SIZE + +.p2align 7 +#define EXT_SIZE 15 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE + subs count, count, 32 + b.lt 2f +1: + stp A_q, B_q, [dst], #32 + prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] + ldp D_q, J_q, [src], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + mov C_v.16b, G_v.16b + stp H_q, I_q, [dst], #32 + ldp F_q, G_q, [src], #32 + ext A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE + ext B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE + mov E_v.16b, J_v.16b + subs count, count, 64 + b.ge 1b +2: + stp A_q, B_q, [dst], #32 + ext H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE + ext I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE + stp H_q, I_q, [dst], #16 + add dst, dst, tmp1 + str G_q, [dst], #16 + b L(copy_long_check32) +#undef EXT_SIZE + +END (MEMCPY) +libc_hidden_builtin_def (MEMCPY) +#endif
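
A few short C sketches of the techniques used above, for review only
(illustrative code with made-up names, not part of the patch).

The 0..3 byte tail (L(bytes_0_to_3)) uses three byte moves that cover
every count from 1 to 3; for count == 1 all three moves hit the same
byte, and for count == 2 the middle byte duplicates one of the end bytes:

#include <stddef.h>

/* Hypothetical C equivalent of the branchless 0..3 byte copy.  */
static void
copy_0_to_3 (unsigned char *dst, const unsigned char *src, size_t count)
{
  if (count == 0)                        /* cbz count, ...           */
    return;
  size_t mid = count >> 1;               /* lsr tmp1, count, 1       */
  unsigned char first = src[0];          /* ldrb A_lw, [src]         */
  unsigned char last = src[count - 1];   /* ldrb A_hw, [srcend, -1]  */
  unsigned char middle = src[mid];       /* ldrb B_lw, [src, tmp1]   */
  dst[0] = first;                        /* strb A_lw, [dstin]       */
  dst[mid] = middle;                     /* strb B_lw, [dstin, tmp1] */
  dst[count - 1] = last;                 /* strb A_hw, [dstend, -1]  */
}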
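
The small and medium paths avoid length-exact loops by letting chunks
overlap; e.g. the 17..32 byte case stores one 16-byte chunk at the start
and one ending at the last byte, which always covers the whole range
(sketch with hypothetical names):

#include <stddef.h>
#include <string.h>

static void
copy_17_to_32 (unsigned char *dst, const unsigned char *src, size_t count)
{
  unsigned char head[16], tail[16];
  memcpy (head, src, 16);                /* ldr A_q, [src]           */
  memcpy (tail, src + count - 16, 16);   /* ldr E_q, [srcend, -16]   */
  memcpy (dst, head, 16);                /* str A_q, [dstin]         */
  memcpy (dst + count - 16, tail, 16);   /* str E_q, [dstend, -16]   */
}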
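
The core of the L(dst_unaligned) path is that both the ldp sources and
the stp destinations stay 16-byte aligned, and every stored chunk is
spliced from two neighbouring aligned chunks with ext.  A rough C model
of that splice (assuming little-endian AArch64; names are made up):

#include <stdint.h>

/* Model of "ext Vd.16b, Vn.16b, Vm.16b, #imm": the result is the
   16-byte window starting imm bytes into the 32 bytes formed by
   Vn (bytes 0..15) followed by Vm (bytes 16..31).  */
static void
ext_splice (uint8_t out[16], const uint8_t lo[16], const uint8_t hi[16],
            unsigned imm)                /* imm in 1..15             */
{
  for (unsigned i = 0; i < 16; i++)
    out[i] = (i + imm < 16) ? lo[i + imm] : hi[i + imm - 16];
}

Since the ext index has to be a compile-time immediate, the loop cannot
take the offset as a runtime value, which is why the patch carries 15
copies of the loop and selects one of them with the computed goto.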