From patchwork Thu Aug 27 16:28:36 2015
X-Patchwork-Submitter: Ondrej Bilka
X-Patchwork-Id: 8485
Date: Thu, 27 Aug 2015 18:28:36 +0200
From: Ondřej Bílka
To: libc-alpha@sourceware.org
Cc: "H.J. Lu"
Subject: [RFC PATCH] Optimize memset with conditional moves.
Message-ID: <20150827162836.GA28702@domone>

Hi,

I got an idea, which I mentioned on the aarch64 thread, to improve memset
and memcpy performance by using conditional moves to avoid branch
misprediction penalties.

Previously, setting a single byte was very slow: we needed a decision tree
to select an implementation for each size, and some range had to take the
slow path, so we marked this one as unlikely.

The trick that I use could be written in C. To handle n less than 16
(a compilable version of this fragment appears below):

char stack[64];
uint64_t *p1 = (uint64_t *) s;
uint32_t *p2 = (uint32_t *) (s + (n & 8));
uint16_t *p3 = (uint16_t *) (s + (n & 12));
char *p4 = s + n - 1;

if ((n & 8) == 0)
  p1 = (uint64_t *) stack;
if ((n & 4) == 0)
  p2 = (uint32_t *) stack;
if ((n & 2) == 0)
  p3 = (uint16_t *) stack;
if ((n & 1) == 0)
  p4 = stack;

*p1 = vc;
*p2 = vc;
*p3 = vc;
*p4 = vc;

and for the 32-128 byte range it is the same trick as what you could do in
the 16-64 byte range with 8-byte moves:

char stack[64];
int add = n > 32 ? 16 : 0;
int sub = n > 32 ? -16 : 0;

*((uint64_t *)(s)) = vc;
*((uint64_t *)(s + 8)) = vc;
*((uint64_t *)(s + add)) = vc;
*((uint64_t *)(s + 8 + add)) = vc;

*((uint64_t *)(s + n - 16)) = vc;
*((uint64_t *)(s + 8 + n - 16)) = vc;
*((uint64_t *)(s + n - 16 + sub)) = vc;
*((uint64_t *)(s + 8 + n - 16 + sub)) = vc;

This could be used generically; I believe that on arm it could give a
considerable speedup, but I would need access to a machine for testing.

This gives on average a 5% improvement over the current implementation;
see the profile results below. I will resend an alternative rep stosq
implementation later.

http://kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_cmov.html
http://kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_cmov270815.tar.bz2

That, as I recall, is where I left memset: I didn't know at which sizes to
select it, since the threshold has to be determined empirically and differs
for each architecture. The problem is that on most architectures this is
faster when the data are in L3 cache or main memory, and I don't know at
which sizes that usually happens; it may be at sizes much smaller than the
cache size limit.

I also included an optimization of the counted loop, which helps the
processor correctly predict the loop end for sizes up to 512 bytes.

The second change is that I extended the header to handle sizes up to 256
bytes, which gives an additional one percent or so.

There is an additional micro-optimization possible: I could save a few
cycles with SSSE3 pshufb. Should I add another variant there, surrounded
by the standard #ifdef USE_SSSE3 pattern?
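To make the first fragment above concrete, here is a self-contained sketch
that can be compiled and checked against libc memset. The function name
memset_small, the scratch buffer and the test harness are illustrative and
not part of the patch; a compiler may or may not turn the ifs into cmov
(the patch does it explicitly in assembly), and the unaligned typed stores
only mirror the fragment (portable code would use memcpy).

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Branchless memset for n < 16: every store is issued unconditionally,
   and a store whose size bit is clear in n is redirected into a scratch
   buffer, so nothing branches on n.  */
static void *
memset_small (void *dst, int c, size_t n)
{
  unsigned char scratch[64];	/* dump target for disabled stores */
  unsigned char *s = dst;
  uint64_t vc = 0x0101010101010101ULL * (unsigned char) c;

  uint64_t *p1 = (uint64_t *) s;		/* bytes 0..7            */
  uint32_t *p2 = (uint32_t *) (s + (n & 8));	/* next four bytes       */
  uint16_t *p3 = (uint16_t *) (s + (n & 12));	/* next two bytes        */
  unsigned char *p4 = s + n - 1;		/* last (odd) byte       */

  if ((n & 8) == 0) p1 = (uint64_t *) scratch;
  if ((n & 4) == 0) p2 = (uint32_t *) scratch;
  if ((n & 2) == 0) p3 = (uint16_t *) scratch;
  if ((n & 1) == 0) p4 = scratch;

  *p1 = vc;
  *p2 = (uint32_t) vc;
  *p3 = (uint16_t) vc;
  *p4 = (unsigned char) vc;
  return dst;
}

int
main (void)
{
  unsigned char buf[16], ref[16];
  for (size_t n = 0; n < 16; n++)
    {
      memset (buf, 0xee, sizeof buf);
      memset (ref, 0xee, sizeof ref);
      memset_small (buf, 0xa5, n);
      memset (ref, 0xa5, n);
      if (memcmp (buf, ref, sizeof buf) != 0)
	printf ("mismatch for n=%zu\n", n);
    }
  return 0;
}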
I didn't do more performance testing yet as I plan to use this pattern also
for memcpy and strcpy. So comments?

	* sysdeps/x86_64/memset.S: Use conditional moves to improve
	performance.

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index e496254..bea402d 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -24,7 +24,7 @@
 ENTRY(__bzero)
 	movq	%rdi, %rax /* Set return value.  */
 	movq	%rsi, %rdx /* Set n.  */
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	jmp	L(entry_from_bzero)
 END(__bzero)
 weak_alias (__bzero, bzero)
@@ -33,10 +33,10 @@ weak_alias (__bzero, bzero)
 ENTRY(__memset_tail)
 	movq	%rcx, %rax /* Set return value.  */
 
-	movd	%esi, %xmm8
-	punpcklbw	%xmm8, %xmm8
-	punpcklwd	%xmm8, %xmm8
-	pshufd	$0, %xmm8, %xmm8
+	movd	%esi, %xmm0
+	punpcklbw	%xmm0, %xmm0
+	punpcklwd	%xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
 
 	jmp	L(entry_from_bzero)
 END(__memset_tail)
@@ -50,78 +50,116 @@ END_CHK (__memset_chk)
 #endif
 
 ENTRY (memset)
-	movd	%esi, %xmm8
+	movd	%esi, %xmm0
 	movq	%rdi, %rax
-	punpcklbw	%xmm8, %xmm8
-	punpcklwd	%xmm8, %xmm8
-	pshufd	$0, %xmm8, %xmm8
+	punpcklbw	%xmm0, %xmm0
+	punpcklwd	%xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
 L(entry_from_bzero):
-	cmpq	$64, %rdx
+	cmpq	$256, %rdx
 	ja	L(loop_start)
-	cmpq	$16, %rdx
-	jbe	L(less_16_bytes)
-	cmpq	$32, %rdx
-	movdqu	%xmm8, (%rdi)
-	movdqu	%xmm8, -16(%rdi,%rdx)
-	ja	L(between_32_64_bytes)
-L(return):
-	rep
+	cmp	$128, %edx
+	jae	L(between_128_256_bytes)
+
+	cmp	$32, %edx
+	jb	L(less_32_bytes)
+
+	lea	-32(%rdi), %rcx
+	lea	32(%rdi), %rsi
+
+	cmp	$64, %edx
+	cmovb	%rdi, %rcx
+	cmovb	%rdi, %rsi
+
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm0, 16(%rdi)
+	movdqu	%xmm0, (%rsi)
+	movdqu	%xmm0, 16(%rsi)
+
+	movdqu	%xmm0, -32(%rcx, %rdx)
+	movdqu	%xmm0, -16(%rcx, %rdx)
+	movdqu	%xmm0, -32(%rdi, %rdx)
+	movdqu	%xmm0, -16(%rdi, %rdx)
 	ret
+
 	.p2align 4
-L(between_32_64_bytes):
-	movdqu	%xmm8, 16(%rdi)
-	movdqu	%xmm8, -32(%rdi,%rdx)
+L(between_128_256_bytes):
+	lea	(%rdi, %rdx), %rsi
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm0, 16(%rdi)
+	movdqu	%xmm0, 32(%rdi)
+	movdqu	%xmm0, 48(%rdi)
+	movdqu	%xmm0, 64(%rdi)
+	movdqu	%xmm0, 80(%rdi)
+	movdqu	%xmm0, 96(%rdi)
+	movdqu	%xmm0, 112(%rdi)
+	movdqu	%xmm0, -128(%rsi)
+	movdqu	%xmm0, -112(%rsi)
+	movdqu	%xmm0, -96(%rsi)
+	movdqu	%xmm0, -80(%rsi)
+	movdqu	%xmm0, -64(%rsi)
+	movdqu	%xmm0, -48(%rsi)
+	movdqu	%xmm0, -32(%rsi)
+	movdqu	%xmm0, -16(%rsi)
 	ret
+
 	.p2align 4
 L(loop_start):
-	leaq	64(%rdi), %rcx
-	movdqu	%xmm8, (%rdi)
-	andq	$-64, %rcx
-	movdqu	%xmm8, -16(%rdi,%rdx)
-	movdqu	%xmm8, 16(%rdi)
-	movdqu	%xmm8, -32(%rdi,%rdx)
-	movdqu	%xmm8, 32(%rdi)
-	movdqu	%xmm8, -48(%rdi,%rdx)
-	movdqu	%xmm8, 48(%rdi)
-	movdqu	%xmm8, -64(%rdi,%rdx)
-	addq	%rdi, %rdx
-	andq	$-64, %rdx
-	cmpq	%rdx, %rcx
-	je	L(return)
-	.p2align 4
+	lea	(%rdi, %rdx), %rsi
+	leaq	16(%rdi), %rcx
+	andq	$-16, %rcx
+	movdqu	%xmm0, -16(%rsi)
+	movdqu	%xmm0, -32(%rsi)
+	movdqu	%xmm0, -48(%rsi)
+	movdqu	%xmm0, -64(%rsi)
+	movdqu	%xmm0, (%rdi)
+	subq	%rcx, %rsi
+	shrq	$6, %rsi
 L(loop):
-	movdqa	%xmm8, (%rcx)
-	movdqa	%xmm8, 16(%rcx)
-	movdqa	%xmm8, 32(%rcx)
-	movdqa	%xmm8, 48(%rcx)
+	movdqa	%xmm0, (%rcx)
+	movdqa	%xmm0, 16(%rcx)
+	movdqa	%xmm0, 32(%rcx)
+	movdqa	%xmm0, 48(%rcx)
 	addq	$64, %rcx
-	cmpq	%rcx, %rdx
+	subq	$1, %rsi
 	jne	L(loop)
-	rep
-	ret
-L(less_16_bytes):
-	movq	%xmm8, %rcx
-	testb	$24, %dl
-	jne	L(between8_16bytes)
-	testb	$4, %dl
-	jne	L(between4_7bytes)
-	testb	$1, %dl
-	je	L(odd_byte)
-	movb	%cl, (%rdi)
-L(odd_byte):
-	testb	$2, %dl
-	je	L(return)
-	movw	%cx, -2(%rax,%rdx)
-	ret
-L(between4_7bytes):
-	movl	%ecx, (%rdi)
-	movl	%ecx, -4(%rdi,%rdx)
-	ret
-L(between8_16bytes):
-	movq	%rcx, (%rdi)
-	movq	%rcx, -8(%rdi,%rdx)
 	ret
+L(less_32_bytes):
+	movq	%xmm0, %rcx
+	lea	-64(%rsp), %r10
+	sub	%rax, %r10
+	lea	64(%rdi), %r9
+	lea	63(%rdi, %rdx), %rsi
+	mov	%rdx, %r8
+	mov	%rdx, %r11
+	mov	%rdx, %rdi
+	and	$16, %r8
+	and	$24, %rdi
+	and	$28, %r11
+
+	test	$16, %edx
+	cmove	%rsp, %r9
+
+	test	$8, %edx
+	cmove	%r10, %r8
+
+	test	$4, %edx
+	cmove	%r10, %rdi
+
+	test	$2, %edx
+	cmove	%r10, %r11
+
+	test	$1, %edx
+	cmove	%rsp, %rsi
+
+	movdqu	%xmm0, -64(%r9)
+	movq	%rcx, (%rax, %r8)
+	movl	%ecx, (%rax, %rdi)
+	movw	%cx, (%rax, %r11)
+	movb	%cl, -64(%rsi)
+
+	ret
 END (memset)
 libc_hidden_builtin_def (memset)
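The new 32-127 byte header above is essentially the 16-byte-store version of
the second fragment in the description, with cmovb selecting the two middle
base pointers instead of adding offsets. A rough C equivalent follows; the
function and variable names are illustrative rather than taken from the
patch, memcpy stands in for the unaligned movdqu stores, and the caller is
assumed to guarantee 32 <= n < 128.

#include <string.h>

/* Rough C equivalent of the new 32..127 byte path.  Eight 16-byte stores
   are always issued; when n < 64 the two middle base pointers are moved
   back onto s ("cmp $64, %edx; cmovb %rdi, %rcx / %rsi" in the patch),
   so the extra stores merely overlap earlier ones.  */
void
memset_32_to_127 (unsigned char *s, int c, size_t n)
{
  unsigned char pat[16];
  memset (pat, (unsigned char) c, sizeof pat);	/* xmm0 in the patch */

  unsigned char *lo = n < 64 ? s : s + 32;	/* %rsi */
  unsigned char *hi = n < 64 ? s : s - 32;	/* %rcx */

  memcpy (s,           pat, 16);	/* movdqu %xmm0, (%rdi)          */
  memcpy (s + 16,      pat, 16);	/* movdqu %xmm0, 16(%rdi)        */
  memcpy (lo,          pat, 16);	/* movdqu %xmm0, (%rsi)          */
  memcpy (lo + 16,     pat, 16);	/* movdqu %xmm0, 16(%rsi)        */
  memcpy (hi + n - 32, pat, 16);	/* movdqu %xmm0, -32(%rcx, %rdx) */
  memcpy (hi + n - 16, pat, 16);	/* movdqu %xmm0, -16(%rcx, %rdx) */
  memcpy (s + n - 32,  pat, 16);	/* movdqu %xmm0, -32(%rdi, %rdx) */
  memcpy (s + n - 16,  pat, 16);	/* movdqu %xmm0, -16(%rdi, %rdx) */
}

For 32 <= n < 64 this writes bytes 0..31 and n-32..n-1, and for
64 <= n < 128 it writes bytes 0..63 and n-64..n-1, so the whole range is
covered without any branch on n other than the conditional moves.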