From patchwork Fri Jun 26 07:12:54 2015
X-Patchwork-Submitter: Ondrej Bilka
X-Patchwork-Id: 7356
Date: Fri, 26 Jun 2015 09:12:54 +0200
From: Ondřej Bílka
To: libc-alpha@sourceware.org
Subject: [PATCH neleai/string-x64] Reoptimize strlen and strnlen
Message-ID: <20150626071254.GA1789@domone>

Hi,

I optimized strlen long ago; back then my main focus was to improve
performance on core2 while keeping reasonable performance on athlons
and old atoms.

The main change is that I now check bytes 16-64 with unaligned loads
instead of aligning these accesses to 16 bytes.  Aligning helped on
older processors, but unaligned loads are now better on i7.

I don't remember whether last time I kept xoring the first four xmm
registers when checking with unaligned loads, or whether I read from
(%rax) instead of (%rdi), which increased latency; either way, plain
unaligned loads are now faster on core2 as well.

Then I made several microoptimizations, like using edx instead of rdx
to save space, and reordering instructions to improve scheduling.

I also tested an avx2 version; again it doesn't help much: on haswell
the difference is 0.2%, while the new sse2 version is 1% faster on
haswell.  Full graphs are here; the only problem I could find is a
0.3% decrease on fx10.  I could reintroduce an ifunc to handle atom
and avx2, but is that worth it?

http://kam.mff.cuni.cz/~ondra/benchmark_string/strlen_profile.html

For reference, a rough C sketch of the new layout is appended after
the patch.

OK to commit this?

	* sysdeps/x86_64/strlen.S (strlen): Add microoptimizations.

---
 sysdeps/x86_64/strlen.S | 336 ++++++++++++++++++++++++------------------------
 1 file changed, 169 insertions(+), 167 deletions(-)

diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index c382c8d..3e8beb0 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,5 +1,5 @@
 /* SSE2 version of strlen.
-   Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   Copyright (C) 2012-2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -18,222 +18,224 @@
 
 #include <sysdep.h>
 
-/* Long lived register in strlen(s), strnlen(s, n) are:
-
-	%xmm11 - zero
-	%rdi   - s
-	%r10  (s+n) & (~(64-1))
-	%r11   s+n
-*/
 
 	.text
 ENTRY(strlen)
-
-/* Test 64 bytes from %rax for zero.  Save result as bitmask in %rdx.  */
-#define FIND_ZERO	\
-	pcmpeqb	(%rax), %xmm8;	\
-	pcmpeqb	16(%rax), %xmm9;	\
-	pcmpeqb	32(%rax), %xmm10;	\
-	pcmpeqb	48(%rax), %xmm11;	\
-	pmovmskb	%xmm8, %esi;	\
-	pmovmskb	%xmm9, %edx;	\
-	pmovmskb	%xmm10, %r8d;	\
-	pmovmskb	%xmm11, %ecx;	\
-	salq	$16, %rdx;	\
-	salq	$16, %rcx;	\
-	orq	%rsi, %rdx;	\
-	orq	%r8, %rcx;	\
-	salq	$32, %rcx;	\
-	orq	%rcx, %rdx;
-
 #ifdef AS_STRNLEN
-/* Do not read anything when n==0.  */
+	mov	%rsi, %r8
+	xor	%edx, %edx
 	test	%rsi, %rsi
-	jne	L(n_nonzero)
-	xor	%rax, %rax
-	ret
-L(n_nonzero):
-
-/* Initialize long lived registers.  */
-
-	add	%rdi, %rsi
-	mov	%rsi, %r10
-	and	$-64, %r10
-	mov	%rsi, %r11
+	je	L(return_zero)
+	cmp	$64, %rsi
+	jae	L(dont_set)
+	bts	%rsi, %rdx
+L(dont_set):
 #endif
-
-	pxor	%xmm8, %xmm8
-	pxor	%xmm9, %xmm9
-	pxor	%xmm10, %xmm10
-	pxor	%xmm11, %xmm11
-	movq	%rdi, %rax
-	movq	%rdi, %rcx
-	andq	$4095, %rcx
-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
-	cmpq	$4047, %rcx
-/* We cannot unify this branching as it would be ~6 cycles slower.  */
+	pxor	%xmm0, %xmm0
+	mov	%edi, %ecx
+	and	$4095, %ecx
+	cmp	$4032, %ecx
 	ja	L(cross_page)
-
+	movdqu	(%rdi), %xmm4
+	pcmpeqb	%xmm0, %xmm4
+	pmovmskb	%xmm4, %ecx
 #ifdef AS_STRNLEN
-/* Test if end is among first 64 bytes.  */
-# define STRNLEN_PROLOG	\
-	mov	%r11, %rsi;	\
-	subq	%rax, %rsi;	\
-	andq	$-64, %rax;	\
-	testq	$-64, %rsi;	\
-	je	L(strnlen_ret)
+	or	%dx, %cx
 #else
-# define STRNLEN_PROLOG  andq $-64, %rax;
+	test	%ecx, %ecx
 #endif
-
-/* Ignore bits in mask that come before start of string.  */
-#define PROLOG(lab)	\
-	movq	%rdi, %rcx;	\
-	xorq	%rax, %rcx;	\
-	STRNLEN_PROLOG;	\
-	sarq	%cl, %rdx;	\
-	test	%rdx, %rdx;	\
-	je	L(lab);	\
-	bsfq	%rdx, %rax;	\
+	je	L(next48_bytes)
+	bsf	%ecx, %eax
 	ret
 
 #ifdef AS_STRNLEN
-	andq	$-16, %rax
-	FIND_ZERO
-#else
-	/* Test first 16 bytes unaligned.  */
-	movdqu	(%rax), %xmm12
-	pcmpeqb	%xmm8, %xmm12
-	pmovmskb	%xmm12, %edx
-	test	%edx, %edx
-	je	L(next48_bytes)
-	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
+L(return_zero):
+	xor	%eax, %eax
 	ret
-
+L(return_noread):
+	add	$64, %rax
+	sub	%rdi, %rax
+	ret
+#endif
+	.p2align 4
 L(next48_bytes):
-/* Same as FIND_ZERO except we do not check first 16 bytes.  */
-	andq	$-16, %rax
-	pcmpeqb	16(%rax), %xmm9
-	pcmpeqb	32(%rax), %xmm10
-	pcmpeqb	48(%rax), %xmm11
-	pmovmskb	%xmm9, %edx
-	pmovmskb	%xmm10, %r8d
-	pmovmskb	%xmm11, %ecx
-	salq	$16, %rdx
-	salq	$16, %rcx
-	orq	%r8, %rcx
+	movdqu	16(%rdi), %xmm1
+	movdqu	32(%rdi), %xmm2
+	movdqu	48(%rdi), %xmm3
+	pcmpeqb	%xmm0, %xmm1
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm0, %xmm3
+#ifdef AS_STRNLEN
+	pmovmskb	%xmm1, %ecx
+	sal	$16, %ecx
+	or	%rcx, %rdx
+#else
+	pmovmskb	%xmm1, %edx
+	sal	$16, %edx
+#endif
+	pmovmskb	%xmm2, %esi
+	pmovmskb	%xmm3, %ecx
+	sal	$16, %ecx
+	or	%esi, %ecx
 	salq	$32, %rcx
 	orq	%rcx, %rdx
-#endif
-
-	/* When no zero byte is found xmm9-11 are zero so we do not have to
-	   zero them.  */
-	PROLOG(loop)
+	je	L(loop_init)
+	bsfq	%rdx, %rax
+	ret
 
 	.p2align 4
 L(cross_page):
-	andq	$-64, %rax
-	FIND_ZERO
-	PROLOG(loop_init)
+	movq	%rdi, %rax
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
 
 #ifdef AS_STRNLEN
-/* We must do this check to correctly handle strnlen (s, -1).  */
-L(strnlen_ret):
-	bts	%rsi, %rdx
+	mov	%rdx, %r9
+#endif
+	andq	$-64, %rax
+	pcmpeqb	(%rax), %xmm0
+	pcmpeqb	16(%rax), %xmm1
+	pcmpeqb	32(%rax), %xmm2
+	pcmpeqb	48(%rax), %xmm3
+	pmovmskb	%xmm0, %esi
+	pxor	%xmm0, %xmm0
+	pmovmskb	%xmm1, %edx
+	pmovmskb	%xmm2, %r10d
+	pmovmskb	%xmm3, %ecx
+	sal	$16, %edx
+	sal	$16, %ecx
+	or	%esi, %edx
+	or	%r10, %rcx
+	salq	$32, %rcx
+	orq	%rcx, %rdx
+	mov	%edi, %ecx
+#ifdef AS_STRNLEN
+	salq	%cl, %r9
+	or	%r9, %rdx
+#endif
 	sarq	%cl, %rdx
 	test	%rdx, %rdx
 	je	L(loop_init)
 	bsfq	%rdx, %rax
 	ret
-#endif
 
 	.p2align 4
 L(loop_init):
-	pxor	%xmm9, %xmm9
-	pxor	%xmm10, %xmm10
-	pxor	%xmm11, %xmm11
+	movq	%rdi, %rax
+	andq	$-64, %rax
 
 #ifdef AS_STRNLEN
+	add	%rdi, %r8
+	sub	%rax, %r8
+	cmp	$64, %r8
+	je	L(return_noread)
+#endif
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+#ifdef USE_AVX2
+	vpxor	%xmm0, %xmm0, %xmm0
+#endif
 	.p2align 4
 L(loop):
+#ifdef USE_AVX2
+	vmovdqa	64(%rax), %ymm1
+	vpminub	96(%rax), %ymm1, %ymm2
+	vpcmpeqb	%ymm0, %ymm2, %ymm2
+	vpmovmskb	%ymm2, %edx
+#else
+	movdqa	64(%rax), %xmm5
+	pminub	80(%rax), %xmm5
+	pminub	96(%rax), %xmm5
+	pminub	112(%rax), %xmm5
+	pcmpeqb	%xmm0, %xmm5
+	pmovmskb	%xmm5, %edx
+#endif
-	addq	$64, %rax
-	cmpq	%rax, %r10
-	je	L(exit_end)
-
-	movdqa	(%rax), %xmm8
-	pminub	16(%rax), %xmm8
-	pminub	32(%rax), %xmm8
-	pminub	48(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+#ifdef AS_STRNLEN
+	sub	$64, %r8
 	testl	%edx, %edx
-	jne	L(exit)
-	jmp	L(loop)
-
-	.p2align 4
-L(exit_end):
-	cmp	%rax, %r11
-	je	L(first) /* Do not read when end is at page boundary.  */
-	pxor	%xmm8, %xmm8
-	FIND_ZERO
-
-L(first):
-	bts	%r11, %rdx
-	bsfq	%rdx, %rdx
-	addq	%rdx, %rax
-	subq	%rdi, %rax
-	ret
-
-	.p2align 4
-L(exit):
-	pxor	%xmm8, %xmm8
-	FIND_ZERO
-
-	bsfq	%rdx, %rdx
-	addq	%rdx, %rax
-	subq	%rdi, %rax
-	ret
-
+	jne	L(exit64)
+	cmp	$64, %r8
+	jbe	L(exit64_zero)
 #else
-
-	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
-	.p2align 4
-L(loop):
-
-	movdqa	64(%rax), %xmm8
-	pminub	80(%rax), %xmm8
-	pminub	96(%rax), %xmm8
-	pminub	112(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
 	testl	%edx, %edx
 	jne	L(exit64)
+#endif
 	subq	$-128, %rax
-
-	movdqa	(%rax), %xmm8
-	pminub	16(%rax), %xmm8
-	pminub	32(%rax), %xmm8
-	pminub	48(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+#ifdef USE_AVX2
+	vmovdqa	(%rax), %ymm1
+	vpminub	32(%rax), %ymm1, %ymm2
+	vpcmpeqb	%ymm0, %ymm2, %ymm2
+	vpmovmskb	%ymm2, %edx
+#else
+	movdqa	(%rax), %xmm5
+	pminub	16(%rax), %xmm5
+	pminub	32(%rax), %xmm5
+	pminub	48(%rax), %xmm5
+	pcmpeqb	%xmm0, %xmm5
+	pmovmskb	%xmm5, %edx
+#endif
+#ifdef AS_STRNLEN
+	sub	$64, %r8
 	testl	%edx, %edx
 	jne	L(exit0)
+	cmp	$64, %r8
+	jbe	L(exit0_zero)
+#else
+	testl	%edx, %edx
+	jne	L(exit0)
+#endif
 	jmp	L(loop)
+#ifdef AS_STRNLEN
+	.p2align 4
+L(exit64_zero):
+	addq	$64, %rax
+L(exit0_zero):
+	add	%r8, %rax
+	sub	%rdi, %rax
+	ret
+#endif
 
 	.p2align 4
+
+
 L(exit64):
 	addq	$64, %rax
 L(exit0):
-	pxor	%xmm8, %xmm8
-	FIND_ZERO
-
+#ifdef USE_AVX2
+	sal	$32, %rdx
+#else
+	sal	$48, %rdx
+#endif
+#ifdef AS_STRNLEN
+	cmp	$64, %r8
+	jae	L(dont_set2)
+	bts	%r8, %rdx
+L(dont_set2):
+#endif
+#ifdef USE_AVX2
+	subq	%rdi, %rax
+	vpcmpeqb	%ymm0, %ymm1, %ymm1
+	vpmovmskb	%ymm1, %ecx
+	vzeroupper
+	or	%rcx, %rdx
+#else
+	pcmpeqb	(%rax), %xmm0
+	pcmpeqb	16(%rax), %xmm1
+	pcmpeqb	32(%rax), %xmm2
+	subq	%rdi, %rax
+	pmovmskb	%xmm0, %esi
+	pmovmskb	%xmm1, %ecx
+	pmovmskb	%xmm2, %r8d
+	sal	$16, %ecx
+	or	%esi, %ecx
+	salq	$32, %r8
+	orq	%r8, %rcx
+	orq	%rcx, %rdx
+#endif
 	bsfq	%rdx, %rdx
 	addq	%rdx, %rax
-	subq	%rdi, %rax
 	ret
-
-#endif
 
 END(strlen)
 libc_hidden_builtin_def (strlen)
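
For readers who prefer C to AT&T assembly, here is a rough, illustrative
sketch of the structure the new strlen follows: unaligned checks of the
first 64 bytes when they cannot cross a page, an aligned-load-and-shift
fallback near a page end, then an aligned 64-byte pminub loop.  This is
only documentation of the approach, not the code that gets built; the
names (zero_mask64, strlen_sketch) are invented for this mail, the
strnlen bookkeeping and the avx2 variant are omitted, and it assumes the
GCC __builtin_ctz family for the bit scans that bsf does in the assembly.

#include <emmintrin.h>	/* SSE2 intrinsics.  */
#include <stddef.h>
#include <stdint.h>

/* Bit i of the result is set iff p[i] == 0, for the 64 aligned bytes
   at P.  This is what the old FIND_ZERO macro computed and what the
   new exit paths still compute to locate the terminator exactly.  */
static uint64_t
zero_mask64 (const char *p)
{
  const __m128i zero = _mm_setzero_si128 ();
  uint64_t m0 = (uint16_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) p), zero));
  uint64_t m1 = (uint16_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p + 16)), zero));
  uint64_t m2 = (uint16_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p + 32)), zero));
  uint64_t m3 = (uint16_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p + 48)), zero));
  return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);
}

size_t
strlen_sketch (const char *s)
{
  const __m128i zero = _mm_setzero_si128 ();
  const char *base = (const char *) ((uintptr_t) s & ~(uintptr_t) 63);

  if (((uintptr_t) s & 4095) <= 4032)
    {
      /* The first 64 bytes cannot cross a page boundary, so check them
	 with four unaligned loads instead of aligning first.  */
      for (size_t i = 0; i < 64; i += 16)
	{
	  __m128i v = _mm_loadu_si128 ((const __m128i *) (s + i));
	  int m = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, zero));
	  if (m != 0)
	    return i + __builtin_ctz (m);
	}
    }
  else
    {
      /* L(cross_page): read the aligned 64-byte block containing S and
	 shift away the bits that belong to bytes before the string.  */
      uint64_t m = zero_mask64 (base) >> ((uintptr_t) s & 63);
      if (m != 0)
	return __builtin_ctzll (m);
    }

  /* Main loop, 64 bytes per iteration from an aligned address.  */
  for (const char *p = base + 64; ; p += 64)
    {
      __m128i v = _mm_load_si128 ((const __m128i *) p);
      v = _mm_min_epu8 (v, _mm_load_si128 ((const __m128i *) (p + 16)));
      v = _mm_min_epu8 (v, _mm_load_si128 ((const __m128i *) (p + 32)));
      v = _mm_min_epu8 (v, _mm_load_si128 ((const __m128i *) (p + 48)));
      if (_mm_movemask_epi8 (_mm_cmpeq_epi8 (v, zero)) != 0)
	return (size_t) (p - s) + __builtin_ctzll (zero_mask64 (p));
    }
}

The pminub trick is the same one the old loop used: the byte-wise
minimum of the four vectors contains a zero byte exactly when one of the
inputs does, so one pcmpeqb/pmovmskb per 64 bytes is enough to detect
the terminator, and its exact position is only recomputed on the exit
path.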