[neleai/string-x64] Reoptimize strlen and strnlen
Commit Message
Hi,
I optimized strlen a long time ago; back then my main focus was to improve
performance on core2 while keeping reasonable performance on athlons and
old atoms.

The main change is that I check bytes 16 through 64 with unaligned loads
instead of aligning these accesses to 16 bytes. Aligning helped on older
processors, but unaligned loads are now better on i7. I don't remember
whether last time I kept xoring the first four xmm registers when checking
with unaligned loads, or read from (%rax) instead of (%rdi), which
increased latency; now simple unaligned loads are faster on core2 as well.
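
As an illustration of the idea (not code from the patch), here is a minimal
C sketch using SSE2 intrinsics: the first 64 bytes are scanned with plain
unaligned loads and the four compare results are merged into one 64-bit
mask, with no pre-alignment of the pointer. The function name and layout
are mine; it assumes the caller already checked that the 64-byte read does
not cross a page, as the prologue of the patch does.

/* Hypothetical sketch of the unaligned header check.  */
#include <emmintrin.h>
#include <stdint.h>

static inline uint64_t
zero_mask_64 (const char *s)
{
  __m128i zero = _mm_setzero_si128 ();
  /* Unaligned loads of bytes 0-63.  */
  __m128i v0 = _mm_loadu_si128 ((const __m128i *) (s + 0));
  __m128i v1 = _mm_loadu_si128 ((const __m128i *) (s + 16));
  __m128i v2 = _mm_loadu_si128 ((const __m128i *) (s + 32));
  __m128i v3 = _mm_loadu_si128 ((const __m128i *) (s + 48));
  /* pcmpeqb/pmovmskb: one mask bit per zero byte in each 16-byte chunk.  */
  uint64_t m0 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v0, zero));
  uint64_t m1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v1, zero));
  uint64_t m2 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v2, zero));
  uint64_t m3 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v3, zero));
  /* Merge into one 64-bit mask; when nonzero, __builtin_ctzll of it
     gives the offset of the first NUL byte.  */
  return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);
}
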
Then I made several microoptimizations, like using edx instead of rdx to
save space (32-bit operations don't need a REX.W prefix) and reordering
instructions to improve scheduling.
I also tested an avx2 version; again it doesn't help much: on haswell the
performance difference is 0.2%, while the new sse2 version is 1% faster
on haswell.
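
For context, the main loop in both variants reduces each 64-byte block to a
single compare by folding the loads with an unsigned byte minimum. A hedged
C sketch of one iteration of the sse2 form (mirroring the pminub sequence
in the patch; the function name is mine):

/* One 64-byte step of the aligned main loop: pminub folds four 16-byte
   chunks, so a single pcmpeqb/pmovmskb detects a NUL anywhere in the
   block.  Assumes p is at least 16-byte aligned.  */
#include <emmintrin.h>

static inline int
has_zero_in_64 (const char *p)
{
  __m128i v = _mm_load_si128 ((const __m128i *) (p + 0));
  v = _mm_min_epu8 (v, _mm_load_si128 ((const __m128i *) (p + 16)));
  v = _mm_min_epu8 (v, _mm_load_si128 ((const __m128i *) (p + 32)));
  v = _mm_min_epu8 (v, _mm_load_si128 ((const __m128i *) (p + 48)));
  __m128i eq = _mm_cmpeq_epi8 (v, _mm_setzero_si128 ());
  return _mm_movemask_epi8 (eq);  /* nonzero iff the block has a NUL */
}

The avx2 variant in the patch does the same with two 32-byte loads and
vpminub per 64-byte block.
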
Full graphs are here; the only problem I could find is a 0.3% decrease
on fx10.

I could reintroduce an ifunc to handle atom and avx2, but is that worth it?
http://kam.mff.cuni.cz/~ondra/benchmark_string/strlen_profile.html
Ok to commit this?
* sysdeps/x86_64/strlen.S (strlen): Add microoptimizations.
---
sysdeps/x86_64/strlen.S | 336 ++++++++++++++++++++++++------------------------
1 file changed, 169 insertions(+), 167 deletions(-)
Comments
On Fri, Jun 26, 2015 at 09:12:54AM +0200, Ondřej Bílka wrote:
> Hi,
>
> I optimized strlen a long time ago; back then my main focus was to improve
> performance on core2 while keeping reasonable performance on athlons and
> old atoms.
>
> The main change is that I check bytes 16 through 64 with unaligned loads
> instead of aligning these accesses to 16 bytes. Aligning helped on older
> processors, but unaligned loads are now better on i7. I don't remember
> whether last time I kept xoring the first four xmm registers when checking
> with unaligned loads, or read from (%rax) instead of (%rdi), which
> increased latency; now simple unaligned loads are faster on core2 as well.
>
> Then I made several microoptimizations, like using edx instead of rdx to
> save space (32-bit operations don't need a REX.W prefix) and reordering
> instructions to improve scheduling.
>
> I also tested an avx2 version; again it doesn't help much: on haswell the
> performance difference is 0.2%, while the new sse2 version is 1% faster
> on haswell.
>
> Full graphs are here; the only problem I could find is a 0.3% decrease
> on fx10.
>
> I could reintroduce an ifunc to handle atom and avx2, but is that worth it?
>
> http://kam.mff.cuni.cz/~ondra/benchmark_string/strlen_profile.html
>
> Ok to commit this?
>
> * sysdeps/x86_64/strlen.S (strlen): Add microoptimizations.
> ---
> sysdeps/x86_64/strlen.S | 336 ++++++++++++++++++++++++------------------------
> 1 file changed, 169 insertions(+), 167 deletions(-)
>
> diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
> index c382c8d..3e8beb0 100644
> --- a/sysdeps/x86_64/strlen.S
> +++ b/sysdeps/x86_64/strlen.S
> @@ -1,5 +1,5 @@
> /* SSE2 version of strlen.
> - Copyright (C) 2012-2015 Free Software Foundation, Inc.
> + Copyright (C) 2012-2015 Free Software Foundation, Inc.
> This file is part of the GNU C Library.
>
> The GNU C Library is free software; you can redistribute it and/or
> @@ -18,222 +18,224 @@
>
> #include <sysdep.h>
>
> -/* Long lived register in strlen(s), strnlen(s, n) are:
> -
> - %xmm11 - zero
> - %rdi - s
> - %r10 (s+n) & (~(64-1))
> - %r11 s+n
> -*/
>
>
> .text
> ENTRY(strlen)
> -
> -/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
> -#define FIND_ZERO \
> - pcmpeqb (%rax), %xmm8; \
> - pcmpeqb 16(%rax), %xmm9; \
> - pcmpeqb 32(%rax), %xmm10; \
> - pcmpeqb 48(%rax), %xmm11; \
> - pmovmskb %xmm8, %esi; \
> - pmovmskb %xmm9, %edx; \
> - pmovmskb %xmm10, %r8d; \
> - pmovmskb %xmm11, %ecx; \
> - salq $16, %rdx; \
> - salq $16, %rcx; \
> - orq %rsi, %rdx; \
> - orq %r8, %rcx; \
> - salq $32, %rcx; \
> - orq %rcx, %rdx;
> -
> #ifdef AS_STRNLEN
> -/* Do not read anything when n==0. */
> + mov %rsi, %r8
> + xor %edx, %edx
> test %rsi, %rsi
> - jne L(n_nonzero)
> - xor %rax, %rax
> - ret
> -L(n_nonzero):
> -
> -/* Initialize long lived registers. */
> -
> - add %rdi, %rsi
> - mov %rsi, %r10
> - and $-64, %r10
> - mov %rsi, %r11
> + je L(return_zero)
> + cmp $64, %rsi
> + jae L(dont_set)
> + bts %rsi, %rdx
> +L(dont_set):
> #endif
> -
> - pxor %xmm8, %xmm8
> - pxor %xmm9, %xmm9
> - pxor %xmm10, %xmm10
> - pxor %xmm11, %xmm11
> - movq %rdi, %rax
> - movq %rdi, %rcx
> - andq $4095, %rcx
> -/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
> - cmpq $4047, %rcx
> -/* We cannot unify this branching as it would be ~6 cycles slower. */
> + pxor %xmm0, %xmm0
> + mov %edi, %ecx
> + and $4095, %ecx
> + cmp $4032, %ecx
> ja L(cross_page)
> -
> + movdqu (%rdi), %xmm4
> + pcmpeqb %xmm0, %xmm4
> + pmovmskb %xmm4, %ecx
> #ifdef AS_STRNLEN
> -/* Test if end is among first 64 bytes. */
> -# define STRNLEN_PROLOG \
> - mov %r11, %rsi; \
> - subq %rax, %rsi; \
> - andq $-64, %rax; \
> - testq $-64, %rsi; \
> - je L(strnlen_ret)
> + or %dx, %cx
> #else
> -# define STRNLEN_PROLOG andq $-64, %rax;
> + test %ecx, %ecx
> #endif
> -
> -/* Ignore bits in mask that come before start of string. */
> -#define PROLOG(lab) \
> - movq %rdi, %rcx; \
> - xorq %rax, %rcx; \
> - STRNLEN_PROLOG; \
> - sarq %cl, %rdx; \
> - test %rdx, %rdx; \
> - je L(lab); \
> - bsfq %rdx, %rax; \
> + je L(next48_bytes)
> + bsf %ecx, %eax
> ret
>
> #ifdef AS_STRNLEN
> - andq $-16, %rax
> - FIND_ZERO
> -#else
> - /* Test first 16 bytes unaligned. */
> - movdqu (%rax), %xmm12
> - pcmpeqb %xmm8, %xmm12
> - pmovmskb %xmm12, %edx
> - test %edx, %edx
> - je L(next48_bytes)
> - bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
> +L(return_zero):
> + xor %eax, %eax
> ret
> -
> +L(return_noread):
> + add $64, %rax
> + sub %rdi, %rax
> + ret
> +#endif
> + .p2align 4
> L(next48_bytes):
> -/* Same as FIND_ZERO except we do not check first 16 bytes. */
> - andq $-16, %rax
> - pcmpeqb 16(%rax), %xmm9
> - pcmpeqb 32(%rax), %xmm10
> - pcmpeqb 48(%rax), %xmm11
> - pmovmskb %xmm9, %edx
> - pmovmskb %xmm10, %r8d
> - pmovmskb %xmm11, %ecx
> - salq $16, %rdx
> - salq $16, %rcx
> - orq %r8, %rcx
> + movdqu 16(%rdi), %xmm1
> + movdqu 32(%rdi), %xmm2
> + movdqu 48(%rdi), %xmm3
> + pcmpeqb %xmm0, %xmm1
> + pcmpeqb %xmm0, %xmm2
> + pcmpeqb %xmm0, %xmm3
> +#ifdef AS_STRNLEN
> + pmovmskb %xmm1, %ecx
> + sal $16, %ecx
> + or %rcx, %rdx
> +#else
> + pmovmskb %xmm1, %edx
> + sal $16, %edx
> +#endif
> + pmovmskb %xmm2, %esi
> + pmovmskb %xmm3, %ecx
> + sal $16, %ecx
> + or %esi, %ecx
> salq $32, %rcx
> orq %rcx, %rdx
> -#endif
> -
> - /* When no zero byte is found xmm9-11 are zero so we do not have to
> - zero them. */
> - PROLOG(loop)
> + je L(loop_init)
> + bsfq %rdx, %rax
> + ret
>
> .p2align 4
> L(cross_page):
> - andq $-64, %rax
> - FIND_ZERO
> - PROLOG(loop_init)
>
> + movq %rdi, %rax
> + pxor %xmm1, %xmm1
> + pxor %xmm2, %xmm2
> + pxor %xmm3, %xmm3
> #ifdef AS_STRNLEN
> -/* We must do this check to correctly handle strnlen (s, -1). */
> -L(strnlen_ret):
> - bts %rsi, %rdx
> + mov %rdx, %r9
> +#endif
> + andq $-64, %rax
> + pcmpeqb (%rax), %xmm0
> + pcmpeqb 16(%rax), %xmm1
> + pcmpeqb 32(%rax), %xmm2
> + pcmpeqb 48(%rax), %xmm3
> + pmovmskb %xmm0, %esi
> + pxor %xmm0, %xmm0
> + pmovmskb %xmm1, %edx
> + pmovmskb %xmm2, %r10d
> + pmovmskb %xmm3, %ecx
> + sal $16, %edx
> + sal $16, %ecx
> + or %esi, %edx
> + or %r10, %rcx
> + salq $32, %rcx
> + orq %rcx, %rdx
> + mov %edi, %ecx
> +#ifdef AS_STRNLEN
> + salq %cl, %r9
> + or %r9, %rdx
> +#endif
> sarq %cl, %rdx
> test %rdx, %rdx
> je L(loop_init)
> bsfq %rdx, %rax
> ret
> -#endif
> .p2align 4
> L(loop_init):
> - pxor %xmm9, %xmm9
> - pxor %xmm10, %xmm10
> - pxor %xmm11, %xmm11
> + movq %rdi, %rax
> + andq $-64, %rax
> #ifdef AS_STRNLEN
> + add %rdi, %r8
> + sub %rax, %r8
> + cmp $64, %r8
> + je L(return_noread)
> +#endif
> + pxor %xmm1, %xmm1
> + pxor %xmm2, %xmm2
> +#ifdef USE_AVX2
> + vpxor %xmm0, %xmm0, %xmm0
> +#endif
> .p2align 4
> L(loop):
> +#ifdef USE_AVX2
> + vmovdqa 64(%rax), %ymm1
> + vpminub 96(%rax), %ymm1, %ymm2
> + vpcmpeqb %ymm0, %ymm2, %ymm2
> + vpmovmskb %ymm2, %edx
> +#else
> + movdqa 64(%rax), %xmm5
> + pminub 80(%rax), %xmm5
> + pminub 96(%rax), %xmm5
> + pminub 112(%rax), %xmm5
> + pcmpeqb %xmm0, %xmm5
> + pmovmskb %xmm5, %edx
> +#endif
>
> - addq $64, %rax
> - cmpq %rax, %r10
> - je L(exit_end)
> -
> - movdqa (%rax), %xmm8
> - pminub 16(%rax), %xmm8
> - pminub 32(%rax), %xmm8
> - pminub 48(%rax), %xmm8
> - pcmpeqb %xmm11, %xmm8
> - pmovmskb %xmm8, %edx
> +#ifdef AS_STRNLEN
> + sub $64, %r8
> testl %edx, %edx
> - jne L(exit)
> - jmp L(loop)
> -
> - .p2align 4
> -L(exit_end):
> - cmp %rax, %r11
> - je L(first) /* Do not read when end is at page boundary. */
> - pxor %xmm8, %xmm8
> - FIND_ZERO
> -
> -L(first):
> - bts %r11, %rdx
> - bsfq %rdx, %rdx
> - addq %rdx, %rax
> - subq %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(exit):
> - pxor %xmm8, %xmm8
> - FIND_ZERO
> -
> - bsfq %rdx, %rdx
> - addq %rdx, %rax
> - subq %rdi, %rax
> - ret
> -
> + jne L(exit64)
> + cmp $64, %r8
> + jbe L(exit64_zero)
> #else
> -
> - /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
> - .p2align 4
> -L(loop):
> -
> - movdqa 64(%rax), %xmm8
> - pminub 80(%rax), %xmm8
> - pminub 96(%rax), %xmm8
> - pminub 112(%rax), %xmm8
> - pcmpeqb %xmm11, %xmm8
> - pmovmskb %xmm8, %edx
> testl %edx, %edx
> jne L(exit64)
> +#endif
>
> subq $-128, %rax
> -
> - movdqa (%rax), %xmm8
> - pminub 16(%rax), %xmm8
> - pminub 32(%rax), %xmm8
> - pminub 48(%rax), %xmm8
> - pcmpeqb %xmm11, %xmm8
> - pmovmskb %xmm8, %edx
> +#ifdef USE_AVX2
> + vmovdqa (%rax), %ymm1
> + vpminub 32(%rax), %ymm1, %ymm2
> + vpcmpeqb %ymm0, %ymm2, %ymm2
> + vpmovmskb %ymm2, %edx
> +#else
> + movdqa (%rax), %xmm5
> + pminub 16(%rax), %xmm5
> + pminub 32(%rax), %xmm5
> + pminub 48(%rax), %xmm5
> + pcmpeqb %xmm0, %xmm5
> + pmovmskb %xmm5, %edx
> +#endif
> +#ifdef AS_STRNLEN
> + sub $64, %r8
> testl %edx, %edx
> jne L(exit0)
> + cmp $64, %r8
> + jbe L(exit0_zero)
> +#else
> + testl %edx, %edx
> + jne L(exit0)
> +#endif
> jmp L(loop)
>
> +#ifdef AS_STRNLEN
> + .p2align 4
> +L(exit64_zero):
> + addq $64, %rax
> +L(exit0_zero):
> + add %r8, %rax
> + sub %rdi, %rax
> + ret
> +#endif
> .p2align 4
> +
> +
> L(exit64):
> addq $64, %rax
> L(exit0):
> - pxor %xmm8, %xmm8
> - FIND_ZERO
> -
> +#ifdef USE_AVX2
> + sal $32, %rdx
> +#else
> + sal $48, %rdx
> +#endif
> +#ifdef AS_STRNLEN
> + cmp $64, %r8
> + jae L(dont_set2)
> + bts %r8, %rdx
> + L(dont_set2):
> +#endif
> +#ifdef USE_AVX2
> + subq %rdi, %rax
> + vpcmpeqb %ymm0, %ymm1, %ymm1
> + vpmovmskb %ymm1, %ecx
> + vzeroupper
> + or %rcx, %rdx
> +#else
> + pcmpeqb (%rax), %xmm0
> + pcmpeqb 16(%rax), %xmm1
> + pcmpeqb 32(%rax), %xmm2
> + subq %rdi, %rax
> + pmovmskb %xmm0, %esi
> + pmovmskb %xmm1, %ecx
> + pmovmskb %xmm2, %r8d
> + sal $16, %ecx
> + or %esi, %ecx
> + salq $32, %r8
> + orq %r8, %rcx
> + orq %rcx, %rdx
> +#endif
> bsfq %rdx, %rdx
> addq %rdx, %rax
> - subq %rdi, %rax
> ret
> -
> -#endif
> -
> END(strlen)
> libc_hidden_builtin_def (strlen)
> --
> 1.8.4.rc3