From patchwork Sun Jun 7 20:52:44 2015
X-Patchwork-Id: 7067
Date: Sun, 7 Jun 2015 22:52:44 +0200
From: Ondřej Bílka
To: libc-alpha@sourceware.org
Subject: [PATCH] x86-64: New memchr implementation.
Message-ID: <20150607205244.GA6997@domone>

Hi,

I decided to also improve memchr, which I had not done before because it
is called relatively rarely.  I used the same technique as in strchr to
get around a 10% speedup and a considerable reduction in code size.

I rely on the fact that the memory area needs to be valid.  That rules
out values of n in the range -64...-1, for which this implementation
could stop early instead of scanning the entire memory.  I could handle
these with an additional check if you want.

There is also a possible optimization that exploits the fact that bsf
sets the zero flag, which would save two tests; is that worth it?

For reference, a rough C sketch of the strategy follows the patch.

	* sysdeps/x86_64/memchr.S (memchr): Improve implementation.

diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index fae85ca..9649b1c 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -1,5 +1,4 @@
-/* Copyright (C) 2011-2015 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
+/* Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -18,292 +17,134 @@
 
 #include <sysdep.h>
 
-/* fast SSE2 version with using pmaxub and 64 byte loop */
+/* fast SSE2 version with using 64 byte loop */
 
 	.text
 ENTRY(memchr)
-	movd	%rsi, %xmm1
-	mov	%rdi, %rcx
-
-	punpcklbw %xmm1, %xmm1
-	test	%rdx, %rdx
-	jz	L(return_null)
-	punpcklbw %xmm1, %xmm1
-
-	and	$63, %rcx
-	pshufd	$0, %xmm1, %xmm1
-
-	cmp	$48, %rcx
-	ja	L(crosscache)
-
-	movdqu	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-
-	jnz	L(matches_1)
-	sub	$16, %rdx
-	jbe	L(return_null)
-	add	$16, %rdi
-	and	$15, %rcx
-	and	$-16, %rdi
-	add	%rcx, %rdx
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-	jmp	L(loop_prolog)
-
-	.p2align 4
-L(crosscache):
-	and	$15, %rcx
-	and	$-16, %rdi
-	movdqa	(%rdi), %xmm0
-
-	pcmpeqb	%xmm1, %xmm0
-/* Check if there is a match.  */
-	pmovmskb %xmm0, %eax
-/* Remove the leading bytes.  */
-	sar	%cl, %eax
-	test	%eax, %eax
-	je	L(unaligned_no_match)
-/* Check which byte is a match. 
*/ + movd %esi, %xmm2 + testq %rdx, %rdx + punpcklbw %xmm2, %xmm2 + punpcklwd %xmm2, %xmm2 + pshufd $0, %xmm2, %xmm2 + je L(return_null) + movl %edi, %eax + andl $4095, %eax + cmpl $4032, %eax + jg L(cross_page) + movdqu (%rdi), %xmm1 + pcmpeqb %xmm2, %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + je L(next_48_bytes) bsf %eax, %eax - - sub %rax, %rdx + cmpq %rax, %rdx jbe L(return_null) - add %rdi, %rax - add %rcx, %rax - ret - - .p2align 4 -L(unaligned_no_match): - add %rcx, %rdx - sub $16, %rdx + addq %rdi, %rax + ret +.p2align 4,,10 +.p2align 3 +L(next_48_bytes): + movdqu 16(%rdi), %xmm1 + movdqu 32(%rdi), %xmm3 + pcmpeqb %xmm2, %xmm1 + pcmpeqb %xmm2, %xmm3 + movdqu 48(%rdi), %xmm4 + pmovmskb %xmm1, %esi + pmovmskb %xmm3, %ecx + pcmpeqb %xmm2, %xmm4 + pmovmskb %xmm4, %eax + salq $32, %rcx + sal $16, %esi + orq %rsi, %rcx + salq $48, %rax + orq %rcx, %rax + je L(prepare_loop) +L(return): + bsf %rax, %rax + cmpq %rax, %rdx jbe L(return_null) - add $16, %rdi - sub $64, %rdx - jbe L(exit_loop) - - .p2align 4 -L(loop_prolog): - movdqa (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) + addq %rdi, %rax + ret - movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 48(%rdi), %xmm4 - pcmpeqb %xmm1, %xmm4 - add $64, %rdi - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches0) - - test $0x3f, %rdi - jz L(align64_loop) - - sub $64, %rdx - jbe L(exit_loop) - - movdqa (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 48(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - - add $64, %rdi - test %eax, %eax - jnz L(matches0) - - mov %rdi, %rcx - and $-64, %rdi - and $63, %rcx - add %rcx, %rdx - - .p2align 4 -L(align64_loop): - sub $64, %rdx - jbe L(exit_loop) - movdqa (%rdi), %xmm0 - movdqa 16(%rdi), %xmm2 - movdqa 32(%rdi), %xmm3 - movdqa 48(%rdi), %xmm4 - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm1, %xmm2 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm1, %xmm4 - - pmaxub %xmm0, %xmm3 - pmaxub %xmm2, %xmm4 +.p2align 4,,10 +.p2align 3 +L(return_null): + xorl %eax, %eax + ret +.p2align 4,,10 +.p2align 4 +L(prepare_loop): + movq %rdi, %rcx + andq $-64, %rcx + subq %rcx, %rdi + leaq (%rdx, %rdi), %rsi +.p2align 4,,10 +.p2align 3 +L(loop): + subq $64, %rsi + jbe L(return_null) + + movdqa 64(%rcx), %xmm0 + movdqa 80(%rcx), %xmm1 + movdqa 96(%rcx), %xmm3 + movdqa 112(%rcx), %xmm4 + + pcmpeqb %xmm2, %xmm0 + pcmpeqb %xmm2, %xmm1 + pcmpeqb %xmm2, %xmm3 + pcmpeqb %xmm2, %xmm4 + + pmaxub %xmm0, %xmm1 + pmaxub %xmm1, %xmm3 pmaxub %xmm3, %xmm4 - pmovmskb %xmm4, %eax - - add $64, %rdi - - test %eax, %eax - jz L(align64_loop) - - sub $64, %rdi - + addq $64, %rcx + pmovmskb %xmm4, %edx + testl %edx, %edx + je L(loop) + pmovmskb %xmm3, %r8d + pmovmskb %xmm1, %edi + salq $48, %rdx pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - - pcmpeqb 48(%rdi), %xmm1 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - pmovmskb %xmm1, %eax - bsf %eax, %eax - lea 48(%rdi, %rax), %rax - ret - - .p2align 4 -L(exit_loop): - 
add $32, %rdx - jle L(exit_loop_32) - - movdqa (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32_1) - sub $16, %rdx - jle L(return_null) - - pcmpeqb 48(%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches48_1) - xor %rax, %rax - ret - - .p2align 4 -L(exit_loop_32): - add $32, %rdx - movdqa (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches_1) - sub $16, %rdx - jbe L(return_null) - - pcmpeqb 16(%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches16_1) - xor %rax, %rax - ret - - .p2align 4 -L(matches0): - bsf %eax, %eax - lea -16(%rax, %rdi), %rax - ret - - .p2align 4 -L(matches): - bsf %eax, %eax - add %rdi, %rax - ret - - .p2align 4 -L(matches16): - bsf %eax, %eax - lea 16(%rax, %rdi), %rax - ret - - .p2align 4 -L(matches32): - bsf %eax, %eax - lea 32(%rax, %rdi), %rax - ret - - .p2align 4 -L(matches_1): - bsf %eax, %eax - sub %rax, %rdx - jbe L(return_null) - add %rdi, %rax - ret - - .p2align 4 -L(matches16_1): - bsf %eax, %eax - sub %rax, %rdx + salq $32, %r8 + sal $16, %edi + or %edi, %eax + orq %r8, %rax + orq %rax, %rdx + bsfq %rdx, %rax + cmp %rax, %rsi jbe L(return_null) - lea 16(%rdi, %rax), %rax + addq %rcx, %rax ret - .p2align 4 -L(matches32_1): - bsf %eax, %eax - sub %rax, %rdx - jbe L(return_null) - lea 32(%rdi, %rax), %rax - ret - - .p2align 4 -L(matches48_1): - bsf %eax, %eax - sub %rax, %rdx - jbe L(return_null) - lea 48(%rdi, %rax), %rax - ret - - .p2align 4 -L(return_null): - xor %rax, %rax - ret +.p2align 4,,10 +.p2align 3 +L(cross_page): + movq %rdi, %rsi + andq $-64, %rsi + movdqa (%rsi), %xmm1 + pcmpeqb %xmm2, %xmm1 + pmovmskb %xmm1, %ecx + movdqa 16(%rsi), %xmm1 + pcmpeqb %xmm2, %xmm1 + pmovmskb %xmm1, %eax + movdqa 32(%rsi), %xmm1 + pcmpeqb %xmm2, %xmm1 + sal $16, %eax + movdqa %xmm2, %xmm0 + pcmpeqb 48(%rsi), %xmm0 + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + salq $32, %r8 + salq $48, %r9 + or %ecx, %eax + orq %r9, %rax + orq %r8, %rax + movq %rdi, %rcx + subq %rsi, %rcx + shrq %cl, %rax + testq %rax, %rax + jne L(return) + jmp L(prepare_loop) END(memchr) strong_alias (memchr, __memchr)
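
For reference, here is a rough C sketch of the strategy the new code
follows (SSE2 intrinsics plus GCC builtins; the function and helper names
are made up for illustration and this is not part of the patch): broadcast
the byte, check the first 64 bytes with unaligned loads unless that would
cross a page boundary, in the page-crossing case read the surrounding
aligned 64 bytes and shift out the leading bits, then run an aligned
64-byte loop, and compare the match offset against the remaining length
only once a candidate match is found.  This deferred bound check is
exactly why the code assumes the whole area is valid and does not handle
n in the range -64...-1 specially.

#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Return a bit mask with bit i set when p[i] == c, for 64 bytes at P.
   ALIGNED selects aligned loads (P must then be 16-byte aligned).  */
static uint64_t
match_mask64 (const unsigned char *p, __m128i c, int aligned)
{
  uint64_t mask = 0;
  for (int i = 0; i < 4; i++)
    {
      __m128i v = aligned
		  ? _mm_load_si128 ((const __m128i *) (p + 16 * i))
		  : _mm_loadu_si128 ((const __m128i *) (p + 16 * i));
      uint64_t m = (uint16_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, c));
      mask |= m << (16 * i);
    }
  return mask;
}

/* Sketch only, not the patch: mirrors the control flow of the new assembly
   under the assumption that the whole [s, s+n) area is readable.  */
static void *
memchr_sketch (const void *s, int c_in, size_t n)
{
  const unsigned char *p = s;
  __m128i c = _mm_set1_epi8 ((char) c_in);
  uint64_t mask;

  if (n == 0)
    return NULL;

  if (((uintptr_t) p & 4095) <= 4032)
    /* Reading 64 bytes from P cannot cross a page boundary.  */
    mask = match_mask64 (p, c, 0);
  else
    {
      /* Near a page end: read the surrounding aligned 64 bytes instead
	 and shift out the bits for bytes that precede P.  */
      const unsigned char *b
	= (const unsigned char *) ((uintptr_t) p & ~(uintptr_t) 63);
      mask = match_mask64 (b, c, 1) >> (p - b);
    }
  if (mask != 0)
    {
      size_t off = __builtin_ctzll (mask);
      return off < n ? (void *) (p + off) : NULL;
    }

  /* Aligned 64-byte loop; REM counts the valid bytes from block A on.  */
  const unsigned char *a
    = (const unsigned char *) ((uintptr_t) p & ~(uintptr_t) 63);
  size_t rem = n + (size_t) (p - a);
  for (;;)
    {
      if (rem <= 64)
	return NULL;
      rem -= 64;
      a += 64;
      mask = match_mask64 (a, c, 1);
      if (mask != 0)
	{
	  size_t off = __builtin_ctzll (mask);
	  return off < rem ? (void *) (a + off) : NULL;
	}
    }
}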