x86-64: New memchr implementation.
Commit Message
Hi,
I decided to also improve memchr, which I didn't do before as it was
relatively rarely called. I used the same technique as in strchr to get
around a 10% speedup and a considerable decrease in code size.
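
For reference, the heart of that technique, as it appears in the diff
below, is the entry check that the first 64 bytes cannot cross a page
boundary, which makes the four unaligned 16-byte loads safe (the
comments here are mine):

	movl	%edi, %eax
	andl	$4095, %eax	/* Offset of s within its 4 KiB page.  */
	cmpl	$4032, %eax	/* 4096 - 64: room for a 64-byte read?  */
	jg	L(cross_page)	/* No: fall back to aligned reads.  */
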
I use the fact that the memory area needs to be valid. That rules out
values of n in the range -64...-1, for which the new code can stop early
instead of scanning the entire memory. I could handle these with an
additional check if you want.
Also, there is a possible optimization that uses the fact that bsf sets
the zero flag to save two tests. Is that worth it?
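
Concretely, bsf with a zero source sets ZF, so the entry sequence

	test	%eax, %eax
	je	L(next_48_bytes)
	bsf	%eax, %eax

could become (a sketch, not part of this patch)

	bsf	%eax, %eax	/* Sets ZF iff %eax was zero.  */
	je	L(next_48_bytes)

and a similar fold may apply to the testq at the end of L(cross_page).
The destination of bsf is undefined when the source is zero, which is
harmless here because L(next_48_bytes) overwrites %eax before reading
it.
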
* sysdeps/x86_64/memchr.S (memchr): Improve implementation.
Comments
ping
On Sun, Jun 07, 2015 at 10:52:44PM +0200, Ondřej Bílka wrote:
> Hi,
>
> I decided to also improve memchr, which I didn't do before as it was
> relatively rarely called. I used the same technique as in strchr to get
> around a 10% speedup and a considerable decrease in code size.
>
> I use the fact that the memory area needs to be valid. That rules out
> values of n in the range -64...-1, for which the new code can stop early
> instead of scanning the entire memory. I could handle these with an
> additional check if you want.
>
> Also, there is a possible optimization that uses the fact that bsf sets
> the zero flag to save two tests. Is that worth it?
>
> * sysdeps/x86_64/memchr.S (memchr): Improve implementation.
>
> diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
> index fae85ca..9649b1c 100644
> --- a/sysdeps/x86_64/memchr.S
> +++ b/sysdeps/x86_64/memchr.S
> @@ -1,5 +1,4 @@
> -/* Copyright (C) 2011-2015 Free Software Foundation, Inc.
> - Contributed by Intel Corporation.
> +/* Copyright (C) 2015 Free Software Foundation, Inc.
> This file is part of the GNU C Library.
>
> The GNU C Library is free software; you can redistribute it and/or
> @@ -18,292 +17,134 @@
>
> #include <sysdep.h>
>
> -/* fast SSE2 version with using pmaxub and 64 byte loop */
> +/* Fast SSE2 version using a 64-byte loop.  */
>
> .text
> ENTRY(memchr)
> - movd %rsi, %xmm1
> - mov %rdi, %rcx
> -
> - punpcklbw %xmm1, %xmm1
> - test %rdx, %rdx
> - jz L(return_null)
> - punpcklbw %xmm1, %xmm1
> -
> - and $63, %rcx
> - pshufd $0, %xmm1, %xmm1
> -
> - cmp $48, %rcx
> - ja L(crosscache)
> -
> - movdqu (%rdi), %xmm0
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %eax
> - test %eax, %eax
> -
> - jnz L(matches_1)
> - sub $16, %rdx
> - jbe L(return_null)
> - add $16, %rdi
> - and $15, %rcx
> - and $-16, %rdi
> - add %rcx, %rdx
> - sub $64, %rdx
> - jbe L(exit_loop)
> - jmp L(loop_prolog)
> -
> - .p2align 4
> -L(crosscache):
> - and $15, %rcx
> - and $-16, %rdi
> - movdqa (%rdi), %xmm0
> -
> - pcmpeqb %xmm1, %xmm0
> -/* Check if there is a match. */
> - pmovmskb %xmm0, %eax
> -/* Remove the leading bytes. */
> - sar %cl, %eax
> - test %eax, %eax
> - je L(unaligned_no_match)
> -/* Check which byte is a match. */
> + movd %esi, %xmm2
> + testq %rdx, %rdx
> + punpcklbw %xmm2, %xmm2
> + punpcklwd %xmm2, %xmm2
> + pshufd $0, %xmm2, %xmm2 /* Broadcast c to all 16 bytes.  */
> + je L(return_null) /* n == 0.  */
> + movl %edi, %eax
> + andl $4095, %eax /* Page offset of s.  */
> + cmpl $4032, %eax /* Do the first 64 bytes cross a page?  */
> + jg L(cross_page)
> + movdqu (%rdi), %xmm1
> + pcmpeqb %xmm2, %xmm1
> + pmovmskb %xmm1, %eax
> + test %eax, %eax
> + je L(next_48_bytes)
> bsf %eax, %eax
> -
> - sub %rax, %rdx
> + cmpq %rax, %rdx /* Match beyond the n limit?  */
> jbe L(return_null)
> - add %rdi, %rax
> - add %rcx, %rax
> - ret
> -
> - .p2align 4
> -L(unaligned_no_match):
> - add %rcx, %rdx
> - sub $16, %rdx
> + addq %rdi, %rax
> + ret
> +.p2align 4,,10
> +.p2align 3
> +L(next_48_bytes):
> + movdqu 16(%rdi), %xmm1
> + movdqu 32(%rdi), %xmm3
> + pcmpeqb %xmm2, %xmm1
> + pcmpeqb %xmm2, %xmm3
> + movdqu 48(%rdi), %xmm4
> + pmovmskb %xmm1, %esi
> + pmovmskb %xmm3, %ecx
> + pcmpeqb %xmm2, %xmm4
> + pmovmskb %xmm4, %eax
> + salq $32, %rcx
> + sal $16, %esi
> + orq %rsi, %rcx
> + salq $48, %rax
> + orq %rcx, %rax /* 64-bit mask of matches in bytes 16..63.  */
> + je L(prepare_loop) /* No match in the first 64 bytes.  */
> +L(return):
> + bsf %rax, %rax /* Index of the first match.  */
> + cmpq %rax, %rdx
> jbe L(return_null)
> - add $16, %rdi
> - sub $64, %rdx
> - jbe L(exit_loop)
> -
> - .p2align 4
> -L(loop_prolog):
> - movdqa (%rdi), %xmm0
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %eax
> - test %eax, %eax
> - jnz L(matches)
> + addq %rdi, %rax
> + ret
>
> - movdqa 16(%rdi), %xmm2
> - pcmpeqb %xmm1, %xmm2
> - pmovmskb %xmm2, %eax
> - test %eax, %eax
> - jnz L(matches16)
> -
> - movdqa 32(%rdi), %xmm3
> - pcmpeqb %xmm1, %xmm3
> - pmovmskb %xmm3, %eax
> - test %eax, %eax
> - jnz L(matches32)
> -
> - movdqa 48(%rdi), %xmm4
> - pcmpeqb %xmm1, %xmm4
> - add $64, %rdi
> - pmovmskb %xmm4, %eax
> - test %eax, %eax
> - jnz L(matches0)
> -
> - test $0x3f, %rdi
> - jz L(align64_loop)
> -
> - sub $64, %rdx
> - jbe L(exit_loop)
> -
> - movdqa (%rdi), %xmm0
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %eax
> - test %eax, %eax
> - jnz L(matches)
> -
> - movdqa 16(%rdi), %xmm2
> - pcmpeqb %xmm1, %xmm2
> - pmovmskb %xmm2, %eax
> - test %eax, %eax
> - jnz L(matches16)
> -
> - movdqa 32(%rdi), %xmm3
> - pcmpeqb %xmm1, %xmm3
> - pmovmskb %xmm3, %eax
> - test %eax, %eax
> - jnz L(matches32)
> -
> - movdqa 48(%rdi), %xmm3
> - pcmpeqb %xmm1, %xmm3
> - pmovmskb %xmm3, %eax
> -
> - add $64, %rdi
> - test %eax, %eax
> - jnz L(matches0)
> -
> - mov %rdi, %rcx
> - and $-64, %rdi
> - and $63, %rcx
> - add %rcx, %rdx
> -
> - .p2align 4
> -L(align64_loop):
> - sub $64, %rdx
> - jbe L(exit_loop)
> - movdqa (%rdi), %xmm0
> - movdqa 16(%rdi), %xmm2
> - movdqa 32(%rdi), %xmm3
> - movdqa 48(%rdi), %xmm4
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm1, %xmm2
> - pcmpeqb %xmm1, %xmm3
> - pcmpeqb %xmm1, %xmm4
> -
> - pmaxub %xmm0, %xmm3
> - pmaxub %xmm2, %xmm4
> +.p2align 4,,10
> +.p2align 3
> +L(return_null):
> + xorl %eax, %eax
> + ret
> +.p2align 4,,10
> +.p2align 4
> +L(prepare_loop):
> + movq %rdi, %rcx
> + andq $-64, %rcx /* 64-byte block containing s.  */
> + subq %rcx, %rdi /* Offset of s within that block.  */
> + leaq (%rdx, %rdi), %rsi /* Bytes left, counted from the block start.  */
> +.p2align 4,,10
> +.p2align 3
> +L(loop):
> + subq $64, %rsi
> + jbe L(return_null)
> +
> + movdqa 64(%rcx), %xmm0
> + movdqa 80(%rcx), %xmm1
> + movdqa 96(%rcx), %xmm3
> + movdqa 112(%rcx), %xmm4
> +
> + pcmpeqb %xmm2, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + pcmpeqb %xmm2, %xmm3
> + pcmpeqb %xmm2, %xmm4
> +
> + pmaxub %xmm0, %xmm1 /* pmaxub of 0x00/0xff masks acts as OR, so */
> + pmaxub %xmm1, %xmm3 /* xmm4 ends up nonzero iff any chunk matched.  */
> pmaxub %xmm3, %xmm4
> - pmovmskb %xmm4, %eax
> -
> - add $64, %rdi
> -
> - test %eax, %eax
> - jz L(align64_loop)
> -
> - sub $64, %rdi
> -
> + addq $64, %rcx
> + pmovmskb %xmm4, %edx
> + testl %edx, %edx
> + je L(loop)
> + pmovmskb %xmm3, %r8d
> + pmovmskb %xmm1, %edi
> + salq $48, %rdx
> pmovmskb %xmm0, %eax
> - test %eax, %eax
> - jnz L(matches)
> -
> - pmovmskb %xmm2, %eax
> - test %eax, %eax
> - jnz L(matches16)
> -
> - movdqa 32(%rdi), %xmm3
> - pcmpeqb %xmm1, %xmm3
> -
> - pcmpeqb 48(%rdi), %xmm1
> - pmovmskb %xmm3, %eax
> - test %eax, %eax
> - jnz L(matches32)
> -
> - pmovmskb %xmm1, %eax
> - bsf %eax, %eax
> - lea 48(%rdi, %rax), %rax
> - ret
> -
> - .p2align 4
> -L(exit_loop):
> - add $32, %rdx
> - jle L(exit_loop_32)
> -
> - movdqa (%rdi), %xmm0
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %eax
> - test %eax, %eax
> - jnz L(matches)
> -
> - movdqa 16(%rdi), %xmm2
> - pcmpeqb %xmm1, %xmm2
> - pmovmskb %xmm2, %eax
> - test %eax, %eax
> - jnz L(matches16)
> -
> - movdqa 32(%rdi), %xmm3
> - pcmpeqb %xmm1, %xmm3
> - pmovmskb %xmm3, %eax
> - test %eax, %eax
> - jnz L(matches32_1)
> - sub $16, %rdx
> - jle L(return_null)
> -
> - pcmpeqb 48(%rdi), %xmm1
> - pmovmskb %xmm1, %eax
> - test %eax, %eax
> - jnz L(matches48_1)
> - xor %rax, %rax
> - ret
> -
> - .p2align 4
> -L(exit_loop_32):
> - add $32, %rdx
> - movdqa (%rdi), %xmm0
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %eax
> - test %eax, %eax
> - jnz L(matches_1)
> - sub $16, %rdx
> - jbe L(return_null)
> -
> - pcmpeqb 16(%rdi), %xmm1
> - pmovmskb %xmm1, %eax
> - test %eax, %eax
> - jnz L(matches16_1)
> - xor %rax, %rax
> - ret
> -
> - .p2align 4
> -L(matches0):
> - bsf %eax, %eax
> - lea -16(%rax, %rdi), %rax
> - ret
> -
> - .p2align 4
> -L(matches):
> - bsf %eax, %eax
> - add %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(matches16):
> - bsf %eax, %eax
> - lea 16(%rax, %rdi), %rax
> - ret
> -
> - .p2align 4
> -L(matches32):
> - bsf %eax, %eax
> - lea 32(%rax, %rdi), %rax
> - ret
> -
> - .p2align 4
> -L(matches_1):
> - bsf %eax, %eax
> - sub %rax, %rdx
> - jbe L(return_null)
> - add %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(matches16_1):
> - bsf %eax, %eax
> - sub %rax, %rdx
> + salq $32, %r8
> + sal $16, %edi
> + or %edi, %eax
> + orq %r8, %rax
> + orq %rax, %rdx
> + bsfq %rdx, %rax
> + cmp %rax, %rsi /* Match beyond the n limit?  */
> jbe L(return_null)
> - lea 16(%rdi, %rax), %rax
> + addq %rcx, %rax
> ret
>
> - .p2align 4
> -L(matches32_1):
> - bsf %eax, %eax
> - sub %rax, %rdx
> - jbe L(return_null)
> - lea 32(%rdi, %rax), %rax
> - ret
> -
> - .p2align 4
> -L(matches48_1):
> - bsf %eax, %eax
> - sub %rax, %rdx
> - jbe L(return_null)
> - lea 48(%rdi, %rax), %rax
> - ret
> -
> - .p2align 4
> -L(return_null):
> - xor %rax, %rax
> - ret
> +.p2align 4,,10
> +.p2align 3
> +L(cross_page):
> + movq %rdi, %rsi
> + andq $-64, %rsi
> + movdqa (%rsi), %xmm1
> + pcmpeqb %xmm2, %xmm1
> + pmovmskb %xmm1, %ecx
> + movdqa 16(%rsi), %xmm1
> + pcmpeqb %xmm2, %xmm1
> + pmovmskb %xmm1, %eax
> + movdqa 32(%rsi), %xmm1
> + pcmpeqb %xmm2, %xmm1
> + sal $16, %eax
> + movdqa %xmm2, %xmm0
> + pcmpeqb 48(%rsi), %xmm0
> + pmovmskb %xmm1, %r8d
> + pmovmskb %xmm0, %r9d
> + salq $32, %r8
> + salq $48, %r9
> + or %ecx, %eax
> + orq %r9, %rax
> + orq %r8, %rax
> + movq %rdi, %rcx
> + subq %rsi, %rcx /* Offset of s within the 64-byte block.  */
> + shrq %cl, %rax /* Drop matches before s.  */
> + testq %rax, %rax
> + jne L(return)
> + jmp L(prepare_loop)
> END(memchr)
>
> strong_alias (memchr, __memchr)
>ping
>On Sun, Jun 07, 2015 at 10:52:44PM +0200, Ondřej Bílka wrote:
>> [...]
Looks good on Haswell and Skylake.
--
Andrew