From patchwork Sat Jun 20 08:35:25 2015
X-Patchwork-Submitter: Ondřej Bílka
X-Patchwork-Id: 7266
Date: Sat, 20 Jun 2015 10:35:25 +0200
From: Ondřej Bílka
To: libc-alpha@sourceware.org
Subject: [PATCH neleai/string-x64] Microoptimize strcmp-sse2-unaligned.
Message-ID: <20150620083525.GA31992@domone>

Hi,

When I read strcmp again to improve strncmp and to add an avx2 strcmp, I
found that I had made several mistakes, mainly caused by first optimizing a
C template and then fixing up the assembly.

The first was my idea to simplify the cross-page check by oring src and
dest.  I recall that I originally did complex cross-page handling where
false positives were cheap.  Then I found that, due to its size, it had a
big overhead and a simple loop was faster when testing with firefox.  That
turned the original decision into a bad one.

The second is to reorganize the loop instructions so that after the loop
ends I can find the last byte without recalculating much, using the trick
that the mask of the last 16 bytes can be ored with the previous three, as
it is only relevant when the previous three are zero.

The third is that gcc generates bad loops with respect to where pointers are
incremented.  You should place the increments after the loads that use them,
not at the start of the loop as gcc does.  That change is responsible for a
10% improvement for large sizes.

Finally, there are microoptimizations that save a few bytes without
measurable performance impact, like using eax instead of rax to save a byte,
or moving or dropping zeroing instructions when they are not needed.

Profile data are here; I will shortly add avx2 numbers for haswell, which I
plan to submit next.

http://kam.mff.cuni.cz/~ondra/benchmark_string/strcmp_profile.html

OK to commit this?

	* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
	(__strcmp_sse2_unaligned): Add several microoptimizations.
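For readers following the assembly, here is a rough C sketch of the first
two points (mine, not part of the patch; the helper names are made up for
illustration): testing each pointer against the page end separately, and
reusing the combined mask of all four 16-byte blocks as the mask of the last
block when computing the final index.

#include <stdint.h>

/* Each pointer is tested on its own; oring src and dest first (the old
   code) produces more false positives than two cheap separate checks.  */
static inline int
near_page_end (const void *p)
{
  /* Fewer than 64 bytes left before the next page boundary.  */
  return ((uintptr_t) p & 4095) > 4032;
}

/* Loop exit: 'combined' is the movemask of the byte-wise minimum of all
   four 16-byte comparison results.  Only the first three block masks are
   recomputed; (combined << 48) affects the bsf result only when the lower
   three masks are zero, and in that case it equals the fourth block's own
   mask.  */
static inline unsigned int
first_diff_index (uint64_t m0, uint64_t m1, uint64_t m2, uint64_t combined)
{
  uint64_t m = m0 | (m1 << 16) | (m2 << 32) | (combined << 48);
  return __builtin_ctzll (m);	/* caller guarantees m != 0 */
}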
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index 20b65fa..03d1b11 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -19,10 +19,13 @@
 #include "sysdep.h"
 
 ENTRY ( __strcmp_sse2_unaligned)
-	movl	%edi, %eax
-	xorl	%edx, %edx
 	pxor	%xmm7, %xmm7
-	orl	%esi, %eax
+	movl	%esi, %eax
+	andl	$4095, %eax
+	cmpl	$4032, %eax
+	jg	L(cross_page)
+
+	movl	%edi, %eax
 	andl	$4095, %eax
 	cmpl	$4032, %eax
 	jg	L(cross_page)
@@ -30,13 +33,11 @@ ENTRY ( __strcmp_sse2_unaligned)
 	movdqu	(%rsi), %xmm0
 	pcmpeqb	%xmm1, %xmm0
 	pminub	%xmm1, %xmm0
-	pxor	%xmm1, %xmm1
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	testq	%rax, %rax
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb	%xmm0, %eax
+	test	%eax, %eax
 	je	L(next_48_bytes)
-L(return):
-	bsfq	%rax, %rdx
+	bsf	%eax, %edx
 	movzbl	(%rdi, %rdx), %eax
 	movzbl	(%rsi, %rdx), %edx
 	subl	%edx, %eax
@@ -50,29 +51,35 @@ L(next_48_bytes):
 	pcmpeqb	%xmm6, %xmm3
 	movdqu	32(%rsi), %xmm2
 	pminub	%xmm6, %xmm3
-	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm7, %xmm3
 	movdqu	48(%rdi), %xmm4
 	pcmpeqb	%xmm5, %xmm2
-	pmovmskb	%xmm3, %edx
+	pmovmskb	%xmm3, %edx
 	movdqu	48(%rsi), %xmm0
 	pminub	%xmm5, %xmm2
-	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm7, %xmm2
 	pcmpeqb	%xmm4, %xmm0
-	pmovmskb	%xmm2, %eax
-	salq	$16, %rdx
+	pmovmskb	%xmm2, %eax
+	sal	$16, %edx
 	pminub	%xmm4, %xmm0
-	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm7, %xmm0
 	salq	$32, %rax
 	orq	%rdx, %rax
-	pmovmskb	%xmm0, %ecx
-	movq	%rcx, %rdx
-	salq	$48, %rdx
-	orq	%rdx, %rax
-	jne	L(return)
+	pmovmskb	%xmm0, %ecx
+	salq	$48, %rcx
+	orq	%rcx, %rax
+	je	L(main_loop_header)
+L(return):
+	bsf	%rax, %rdx
+	movzbl	(%rdi, %rdx), %eax
+	movzbl	(%rsi, %rdx), %edx
+	subl	%edx, %eax
+	ret
+
+
 L(main_loop_header):
 	leaq	64(%rdi), %rdx
 	movl	$4096, %ecx
-	pxor	%xmm9, %xmm9
 	andq	$-64, %rdx
 	subq	%rdi, %rdx
 	leaq	(%rdi, %rdx), %rax
@@ -82,16 +89,11 @@ L(main_loop_header):
 	subq	%rsi, %rcx
 	shrq	$6, %rcx
 	movq	%rcx, %rsi
-	jmp	L(loop_start)
 
 	.p2align 4
 L(loop):
-	addq	$64, %rax
-	addq	$64, %rdx
-L(loop_start):
-	testq	%rsi, %rsi
-	leaq	-1(%rsi), %rsi
-	je	L(loop_cross_page)
+	add	$-1, %rsi
+	ja	L(loop_cross_page)
 L(back_to_loop):
 	movdqu	(%rdx), %xmm0
 	movdqu	16(%rdx), %xmm1
@@ -104,61 +106,57 @@ L(back_to_loop):
 	movdqu	48(%rdx), %xmm6
 	pminub	%xmm3, %xmm1
 	movdqa	32(%rax), %xmm2
-	pminub	%xmm1, %xmm0
 	movdqa	48(%rax), %xmm3
 	pcmpeqb	%xmm2, %xmm5
 	pcmpeqb	%xmm3, %xmm6
+	addq	$64, %rax
 	pminub	%xmm2, %xmm5
 	pminub	%xmm3, %xmm6
-	pminub	%xmm5, %xmm0
-	pminub	%xmm6, %xmm0
-	pcmpeqb	%xmm7, %xmm0
-	pmovmskb	%xmm0, %ecx
+	addq	$64, %rdx
+	pminub	%xmm5, %xmm6
+	pminub	%xmm1, %xmm6
+	pminub	%xmm0, %xmm6
+	pcmpeqb	%xmm7, %xmm6
+	pmovmskb	%xmm6, %ecx
 	testl	%ecx, %ecx
 	je	L(loop)
-	pcmpeqb	%xmm7, %xmm5
-	movdqu	(%rdx), %xmm0
-	pcmpeqb	%xmm7, %xmm1
-	movdqa	(%rax), %xmm2
-	pcmpeqb	%xmm2, %xmm0
-	pminub	%xmm2, %xmm0
-	pcmpeqb	%xmm7, %xmm6
 	pcmpeqb	%xmm7, %xmm0
-	pmovmskb	%xmm1, %ecx
-	pmovmskb	%xmm5, %r8d
-	pmovmskb	%xmm0, %edi
-	salq	$16, %rcx
+	pcmpeqb	%xmm7, %xmm1
+	pcmpeqb	%xmm7, %xmm5
+	pmovmskb	%xmm0, %edi
+	pmovmskb	%xmm1, %esi
+	pmovmskb	%xmm5, %r8d
+	salq	$48, %rcx
 	salq	$32, %r8
-	pmovmskb	%xmm6, %esi
 	orq	%r8, %rcx
 	orq	%rdi, %rcx
-	salq	$48, %rsi
+	sal	$16, %esi
 	orq	%rsi, %rcx
 	bsfq	%rcx, %rcx
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
+	movzbl	-64(%rax, %rcx), %eax
+	movzbl	-64(%rdx, %rcx), %edx
 	subl	%edx, %eax
 	ret
 
 	.p2align 4
 L(loop_cross_page):
-	xor	%r10, %r10
+	xor	%ecx, %ecx
 	movq	%rdx, %r9
 	and	$63, %r9
-	subq	%r9, %r10
+	subq	%r9, %rcx
 
-	movdqa	(%rdx, %r10), %xmm0
-	movdqa	16(%rdx, %r10), %xmm1
-	movdqu	(%rax, %r10), %xmm2
-	movdqu	16(%rax, %r10), %xmm3
+	movdqa	(%rdx, %rcx), %xmm0
+	movdqa	16(%rdx, %rcx), %xmm1
+	movdqu	(%rax, %rcx), %xmm2
+	movdqu	16(%rax, %rcx), %xmm3
 	pcmpeqb	%xmm2, %xmm0
-	movdqa	32(%rdx, %r10), %xmm5
+	movdqa	32(%rdx, %rcx), %xmm5
 	pcmpeqb	%xmm3, %xmm1
 	pminub	%xmm2, %xmm0
-	movdqa	48(%rdx, %r10), %xmm6
+	movdqa	48(%rdx, %rcx), %xmm6
 	pminub	%xmm3, %xmm1
-	movdqu	32(%rax, %r10), %xmm2
-	movdqu	48(%rax, %r10), %xmm3
+	movdqu	32(%rax, %rcx), %xmm2
+	movdqu	48(%rax, %rcx), %xmm3
 	pcmpeqb	%xmm2, %xmm5
 	pcmpeqb	%xmm3, %xmm6
 	pminub	%xmm2, %xmm5
@@ -169,12 +167,12 @@ L(loop_cross_page):
 	pcmpeqb	%xmm7, %xmm5
 	pcmpeqb	%xmm7, %xmm6
 
-	pmovmskb	%xmm1, %ecx
-	pmovmskb	%xmm5, %r8d
-	pmovmskb	%xmm0, %edi
-	salq	$16, %rcx
+	pmovmskb	%xmm1, %ecx
+	pmovmskb	%xmm5, %r8d
+	pmovmskb	%xmm0, %edi
+	sal	$16, %ecx
 	salq	$32, %r8
-	pmovmskb	%xmm6, %esi
+	pmovmskb	%xmm6, %esi
 	orq	%r8, %rdi
 	orq	%rcx, %rdi
 	salq	$48, %rsi
@@ -190,20 +188,21 @@ L(loop_cross_page):
 	subl	%edx, %eax
 	ret
 
+L(cross_page):
+	xorl	%edx, %edx
+	jmp	L(cross_page_loop_start)
 	.p2align 4
 L(cross_page_loop):
-	cmpb	%cl, %al
-	jne	L(different)
-	addq	$1, %rdx
-	cmpq	$64, %rdx
+	add	$1, %edx
+	cmp	$64, %edx
 	je	L(main_loop_header)
-L(cross_page):
+L(cross_page_loop_start):
 	movzbl	(%rdi, %rdx), %eax
 	movzbl	(%rsi, %rdx), %ecx
-	testb	%al, %al
+	subl	%ecx, %eax
+	jne	L(different)
+	test	%ecx, %ecx
 	jne	L(cross_page_loop)
-	xorl	%eax, %eax
 L(different):
-	subl	%ecx, %eax
 	ret
 END (__strcmp_sse2_unaligned)
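As an illustration of the third point (pointer increments placed right after
the loads that use them, with the exit path compensating through a negative
displacement), here is a simplified one-block-per-iteration C sketch using
SSE2 intrinsics.  It is mine, not the committed code, and it assumes both
strings stay at least 16 readable bytes ahead, which is what the page-cross
logic in the patch guarantees for the real 64-byte loop.

#include <emmintrin.h>

static int
inner_loop_sketch (const unsigned char *s1, const unsigned char *s2)
{
  const __m128i zero = _mm_setzero_si128 ();
  for (;;)
    {
      __m128i a = _mm_loadu_si128 ((const __m128i *) s1);
      __m128i b = _mm_loadu_si128 ((const __m128i *) s2);
      s1 += 16;		/* increment directly after the loads,	*/
      s2 += 16;		/* not at the top of the loop		*/
      /* A zero byte here marks a mismatch or a terminating NUL.  */
      __m128i eq = _mm_min_epu8 (_mm_cmpeq_epi8 (a, b), b);
      int m = _mm_movemask_epi8 (_mm_cmpeq_epi8 (eq, zero));
      if (m == 0)
	continue;
      int i = __builtin_ctz (m);
      /* The -16 displacement undoes the early increment, like the
	 -64(%rax, %rcx) / -64(%rdx, %rcx) loads in the patch.  */
      return s1[i - 16] - s2[i - 16];
    }
}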