From patchwork Wed Dec 17 12:12:25 2014
X-Patchwork-Submitter: Richard Earnshaw
X-Patchwork-Id: 4298
Message-ID: <54917329.4090601@arm.com>
Date: Wed, 17 Dec 2014 12:12:25 +0000
From: Richard Earnshaw
To: Glibc Development List <libc-alpha@sourceware.org>
Subject: [Patch, AArch64] Optimized strcpy

This patch contains an optimized implementation of strcpy for AArch64
systems.  Benchmarking shows that it is approximately 20-25% faster than
the generic implementation across the board.

R.

Richard Earnshaw

	* sysdeps/aarch64/strcpy.S: New file.

diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
new file mode 100644
index 0000000..1cdf2a1
--- /dev/null
+++ b/sysdeps/aarch64/strcpy.S
@@ -0,0 +1,202 @@
+/* Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses
+ */
+
+/* Arguments and results.  */
+#define dstin		x0
+#define src		x1
+
+/* Locals and temporaries.  */
+#define dst		x2
+#define data1		x3
+#define data1w		w3
+#define data2		x4
+#define has_nul1	x5
+#define has_nul2	x6
+#define tmp1		x7
+#define tmp2		x8
+#define tmp3		x9
+#define tmp4		x10
+#define zeroones	x11
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+	/* Start of critical section -- keep to one 64-byte cache line.  */
+ENTRY_ALIGN (strcpy,6)
+	mov	zeroones, #REP8_01
+	mov	dst, dstin
+	ands	tmp1, src, #15
+	b.ne	L(misaligned)
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word.  */
+	/* The inner loop deals with two Dwords at a time.  This has a
+	   slightly higher start-up cost, but we should win quite quickly,
+	   especially on cores with a high number of issue slots per
+	   cycle, as we get much better parallelism out of the operations.  */
+	b	L(first_pass)
+L(main_loop):
+	stp	data1, data2, [dst], #16
+L(startloop_fast):
+	ldp	data1, data2, [src], #16
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+	bic	has_nul1, tmp1, tmp2
+	bics	has_nul2, tmp3, tmp4
+	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
+	b.eq	L(main_loop)
+	/* End of critical section -- keep to one 64-byte cache line.  */
+
+	cbnz	has_nul1, L(nul_in_data1_fast)
+L(nul_in_data2_fast):
+	str	data1, [dst], #8
+L(nul_in_data2_fast_after_d1):
+	/* For a NUL in data2, we always know that we've moved at least 8
+	   bytes, so no need for a slow path.  */
+#ifdef __AARCH64EB__
+	/* For big-endian only, carry propagation means we can't trust
+	   the MSB of the syndrome value calculated above (the byte
+	   sequence 01 00 will generate a syndrome of 80 80 rather than
+	   00 80).  We get around this by byte-swapping the data and
+	   re-calculating.  */
+	rev	data2, data2
+	sub	tmp1, data2, zeroones
+	orr	tmp2, data2, #REP8_7f
+	bic	has_nul2, tmp1, tmp2
+#endif
+	rev	has_nul2, has_nul2
+	sub	src, src, #(8+7)
+	clz	has_nul2, has_nul2
+	lsr	has_nul2, has_nul2, #3	/* Bits to bytes.  */
+	sub	dst, dst, #7
+	ldr	data2, [src, has_nul2]
+	str	data2, [dst, has_nul2]
+	ret
+
+L(nul_in_data1_fast):
+	/* Since we know we've already copied at least 8 bytes, we can
+	   safely handle the tail with one misaligned dword move.  To do
+	   this we calculate the location of the trailing NUL byte and go
+	   seven bytes back from that.  */
+#ifdef __AARCH64EB__
+	/* For big-endian only, carry propagation means we can't trust
+	   the MSB of the syndrome value calculated above (the byte
+	   sequence 01 00 will generate a syndrome of 80 80 rather than
+	   00 80).  We get around this by byte-swapping the data and
+	   re-calculating.  */
+	rev	data1, data1
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	bic	has_nul1, tmp1, tmp2
+#endif
+	rev	has_nul1, has_nul1
+	sub	src, src, #(16+7)
+	clz	has_nul1, has_nul1
+	lsr	has_nul1, has_nul1, #3	/* Bits to bytes.  */
+	sub	dst, dst, #7
+	ldr	data1, [src, has_nul1]
+	str	data1, [dst, has_nul1]
+	ret
+
+L(first_pass):
+	ldp	data1, data2, [src], #16
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+	bic	has_nul1, tmp1, tmp2
+	bics	has_nul2, tmp3, tmp4
+	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
+	b.eq	L(main_loop)
+
+	cbz	has_nul1, L(nul_in_data2_fast)
+L(nul_in_data1):
+	/* Slow path.  We can't be sure we've moved at least 8 bytes, so
+	   fall back to a slow byte-by-byte store of the bits already
+	   loaded.
+
+	   The worst case when coming through this path is that we've had
+	   to copy seven individual bytes to get to alignment and we then
+	   have to copy another seven (eight for big-endian) again here.
+	   We could try to detect that case (and any case where more than
+	   eight bytes have to be copied), but it really doesn't seem
+	   worth it.  */
+#ifdef __AARCH64EB__
+	rev	data1, data1
+#else
+	/* On little-endian, we can easily check if the NUL byte was
+	   in the last byte of the Dword.  For big-endian we'd have to
+	   recalculate the syndrome, which is unlikely to be worth it.  */
+	lsl	has_nul1, has_nul1, #8
+	cbnz	has_nul1, 1f
+	str	data1, [dst]
+	ret
+#endif
+1:
+	strb	data1w, [dst], #1
+	tst	data1, #0xff
+	lsr	data1, data1, #8
+	b.ne	1b
+L(done):
+	ret
+
+L(misaligned):
+	cmp	tmp1, #8
+	b.ge	2f
+	/* There's at least one Dword before we reach alignment, so we can
+	   deal with that efficiently.  */
+	ldr	data1, [src]
+	bic	src, src, #15
+	sub	tmp3, data1, zeroones
+	orr	tmp4, data1, #REP8_7f
+	bics	has_nul1, tmp3, tmp4
+	b.ne	L(nul_in_data1)
+	str	data1, [dst], #8
+	ldr	data2, [src, #8]
+	add	src, src, #16
+	sub	dst, dst, tmp1
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+	bics	has_nul2, tmp3, tmp4
+	b.ne	L(nul_in_data2_fast_after_d1)
+	str	data2, [dst], #8
+	/* We can by-pass the first-pass version of the loop in this case
+	   since we know that at least 8 bytes have already been copied.  */
+	b	L(startloop_fast)
+
+2:
+	sub	tmp1, tmp1, #16
+3:
+	ldrb	data1w, [src], #1
+	strb	data1w, [dst], #1
+	cbz	data1w, L(done)
+	add	tmp1, tmp1, #1
+	cbnz	tmp1, 3b
+	b	L(first_pass)
+END (strcpy)
+libc_hidden_builtin_def (strcpy)
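
As a footnote for readers less familiar with the idiom: the zero-byte test
used throughout the patch corresponds roughly to the following C sketch.
This is illustrative only, not part of the patch, and the helper names are
made up here:

#include <stdint.h>

/* A byte of X is zero iff the corresponding byte of
   (X - 0x01..01) & ~(X | 0x7f..7f) has its top bit set; this is what the
   sub/orr/bic(s) sequences above compute for each 64-bit word.  */
static inline uint64_t
has_nul_syndrome (uint64_t x)
{
  const uint64_t rep8_01 = 0x0101010101010101ULL;
  const uint64_t rep8_7f = 0x7f7f7f7f7f7f7f7fULL;
  return (x - rep8_01) & ~(x | rep8_7f);
}

/* Little-endian only: byte index of the first NUL, mirroring the
   rev/clz/lsr #3 sequence in the fast paths.  Requires a non-zero
   syndrome.  (On big-endian the assembly byte-swaps the data and
   recomputes the syndrome first, because borrow propagation can set a
   spurious high bit -- the 01 00 => 80 80 case described in the
   comments above.)  */
static inline unsigned int
first_nul_index (uint64_t syndrome)
{
  return __builtin_clzll (__builtin_bswap64 (syndrome)) >> 3;
}

In these terms, the main loop keeps iterating while
(has_nul_syndrome (data1) | has_nul_syndrome (data2)) == 0; the bics/ccmp
pair evaluates that combined test without a separate OR or an extra branch.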