[neleai/string-x64] Reoptimize strlen and strnlen

Message ID 20150626071254.GA1789@domone
State New, archived

Commit Message

Ondrej Bilka June 26, 2015, 7:12 a.m. UTC
  Hi,

When I optimized strlen long ago, my main focus was to improve performance
on core2 and have reasonable performance on athlons and old atoms.

The main change is that I check bytes 16-64 with unaligned loads instead
of aligning to 16 bytes first. Aligning improved performance on older
processors, but now unaligned loads are better on i7. I don't remember
whether last time I kept xoring the first four xmm registers when checking
the unaligned loads, or read from (%rax) instead of (%rdi), which increased
latency; now simple unaligned loads are faster on core2 as well.

Then I made several micro-optimizations, like using edx instead of rdx to
save space (the 32-bit forms avoid a REX prefix byte) and reordering
instructions to improve scheduling.

I also tested an avx2 version; again it doesn't help much: on haswell the
performance difference is only 0.2%, while the new sse2 is 1% faster.

Full graphs are here; the only problem I could find is a 0.3% decrease on
fx10.

I could reintroduce an ifunc to handle atom and avx2, but is that worth it?
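
For reference, a minimal ifunc dispatch on an ELF/GNU target would look
roughly like this (illustrative names and stand-in implementations, not the
glibc multiarch machinery):

#include <stddef.h>

/* Stand-ins for the real sse2/avx2 implementations.  */
static size_t
my_strlen_sse2 (const char *s)
{
  size_t n = 0;
  while (s[n] != '\0')
    n++;
  return n;
}

static size_t
my_strlen_avx2 (const char *s)
{
  size_t n = 0;
  while (s[n] != '\0')
    n++;
  return n;
}

typedef size_t (*strlen_fn) (const char *);

/* The resolver runs once, when the binary is relocated, and picks the
   variant for the current CPU.  */
static strlen_fn
my_strlen_resolver (void)
{
  __builtin_cpu_init ();
  return __builtin_cpu_supports ("avx2") ? my_strlen_avx2 : my_strlen_sse2;
}

size_t my_strlen (const char *s)
     __attribute__ ((ifunc ("my_strlen_resolver")));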

http://kam.mff.cuni.cz/~ondra/benchmark_string/strlen_profile.html

Ok to commit this?

	* sysdeps/x86_64/strlen.S (strlen): Add microoptimizations.
---
 sysdeps/x86_64/strlen.S | 336 ++++++++++++++++++++++++------------------------
 1 file changed, 169 insertions(+), 167 deletions(-)
  

Comments

Ondrej Bilka July 3, 2015, 7:41 a.m. UTC | #1
On Fri, Jun 26, 2015 at 09:12:54AM +0200, Ondřej Bílka wrote:
> Hi,
> 
> When I optimized strlen long ago, my main focus was to improve performance
> on core2 and have reasonable performance on athlons and old atoms.
> 
> The main change is that I check bytes 16-64 with unaligned loads instead
> of aligning to 16 bytes first. Aligning improved performance on older
> processors, but now unaligned loads are better on i7. I don't remember
> whether last time I kept xoring the first four xmm registers when checking
> the unaligned loads, or read from (%rax) instead of (%rdi), which increased
> latency; now simple unaligned loads are faster on core2 as well.
> 
> Then I made several micro-optimizations, like using edx instead of rdx to
> save space (the 32-bit forms avoid a REX prefix byte) and reordering
> instructions to improve scheduling.
> 
> I also tested an avx2 version; again it doesn't help much: on haswell the
> performance difference is only 0.2%, while the new sse2 is 1% faster.
> 
> Full graphs are here; the only problem I could find is a 0.3% decrease on
> fx10.
> 
> I could reintroduce an ifunc to handle atom and avx2, but is that worth it?
> 
> http://kam.mff.cuni.cz/~ondra/benchmark_string/strlen_profile.html
> 
> Ok to commit this?
> 
> 	* sysdeps/x86_64/strlen.S (strlen): Add microoptimizations.
> ---
>  sysdeps/x86_64/strlen.S | 336 ++++++++++++++++++++++++------------------------
>  1 file changed, 169 insertions(+), 167 deletions(-)
> 

Patch

diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index c382c8d..3e8beb0 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,5 +1,5 @@ 
 /* SSE2 version of strlen.
-   Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   Copyright (C) 2012-2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -18,222 +18,224 @@ 
 
 #include <sysdep.h>
 
-/* Long lived register in strlen(s), strnlen(s, n) are:
-
-	%xmm11 - zero
-	%rdi   - s
-	%r10  (s+n) & (~(64-1))
-	%r11   s+n
-*/
 
 
 .text
 ENTRY(strlen)
-
-/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
-#define FIND_ZERO	\
-	pcmpeqb	(%rax), %xmm8;	\
-	pcmpeqb	16(%rax), %xmm9;	\
-	pcmpeqb	32(%rax), %xmm10;	\
-	pcmpeqb	48(%rax), %xmm11;	\
-	pmovmskb	%xmm8, %esi;	\
-	pmovmskb	%xmm9, %edx;	\
-	pmovmskb	%xmm10, %r8d;	\
-	pmovmskb	%xmm11, %ecx;	\
-	salq	$16, %rdx;	\
-	salq	$16, %rcx;	\
-	orq	%rsi, %rdx;	\
-	orq	%r8, %rcx;	\
-	salq	$32, %rcx;	\
-	orq	%rcx, %rdx;
-
 #ifdef AS_STRNLEN
-/* Do not read anything when n==0.  */
+	mov	%rsi, %r8
+	xor	%edx, %edx
 	test	%rsi, %rsi
-	jne	L(n_nonzero)
-	xor	%rax, %rax
-	ret
-L(n_nonzero):
-
-/* Initialize long lived registers.  */
-
-	add	%rdi, %rsi
-	mov	%rsi, %r10
-	and	$-64, %r10
-	mov	%rsi, %r11
+	je	L(return_zero)
+	cmp	$64,  %rsi
+	jae	L(dont_set)
+	bts	%rsi, %rdx
+L(dont_set):
 #endif
-
-	pxor	%xmm8, %xmm8
-	pxor	%xmm9, %xmm9
-	pxor	%xmm10, %xmm10
-	pxor	%xmm11, %xmm11
-	movq	%rdi, %rax
-	movq	%rdi, %rcx
-	andq	$4095, %rcx
-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
-	cmpq	$4047, %rcx
-/* We cannot unify this branching as it would be ~6 cycles slower.  */
+	pxor	%xmm0, %xmm0
+	mov	%edi, %ecx
+	and	$4095, %ecx
+	cmp	$4032, %ecx
 	ja	L(cross_page)
-
+	movdqu	(%rdi), %xmm4
+	pcmpeqb	%xmm0, %xmm4
+	pmovmskb %xmm4, %ecx
 #ifdef AS_STRNLEN
-/* Test if end is among first 64 bytes.  */
-# define STRNLEN_PROLOG	\
-	mov	%r11, %rsi;	\
-	subq	%rax, %rsi;	\
-	andq	$-64, %rax;	\
-	testq	$-64, %rsi;	\
-	je	L(strnlen_ret)
+	or	%dx, %cx
 #else
-# define STRNLEN_PROLOG  andq $-64, %rax;
+	test	%ecx, %ecx
 #endif
-
-/* Ignore bits in mask that come before start of string.  */
-#define PROLOG(lab)	\
-	movq	%rdi, %rcx;	\
-	xorq	%rax, %rcx;	\
-	STRNLEN_PROLOG;	\
-	sarq	%cl, %rdx;	\
-	test	%rdx, %rdx;	\
-	je	L(lab);	\
-	bsfq	%rdx, %rax;	\
+	je	L(next48_bytes)
+	bsf	%ecx, %eax
 	ret
 
 #ifdef AS_STRNLEN
-	andq	$-16, %rax
-	FIND_ZERO
-#else
-	/* Test first 16 bytes unaligned.  */
-	movdqu	(%rax), %xmm12
-	pcmpeqb	%xmm8, %xmm12
-	pmovmskb	%xmm12, %edx
-	test	%edx, %edx
-	je 	L(next48_bytes)
-	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
+L(return_zero):
+	xor	%eax, %eax
 	ret
-
+L(return_noread):
+	add	$64, %rax
+	sub	%rdi, %rax
+	ret
+#endif
+	.p2align 4
 L(next48_bytes):
-/* Same as FIND_ZERO except we do not check first 16 bytes.  */
-	andq	$-16, %rax
-	pcmpeqb 16(%rax), %xmm9
-	pcmpeqb 32(%rax), %xmm10
-	pcmpeqb 48(%rax), %xmm11
-	pmovmskb	%xmm9, %edx
-	pmovmskb	%xmm10, %r8d
-	pmovmskb	%xmm11, %ecx
-	salq	$16, %rdx
-	salq	$16, %rcx
-	orq	%r8, %rcx
+	movdqu	16(%rdi), %xmm1
+	movdqu	32(%rdi), %xmm2
+	movdqu	48(%rdi), %xmm3
+	pcmpeqb	%xmm0, %xmm1
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm0, %xmm3
+#ifdef AS_STRNLEN
+	pmovmskb %xmm1, %ecx
+	sal	$16, %ecx
+	or	%rcx, %rdx
+#else
+	pmovmskb %xmm1, %edx
+	sal	$16, %edx
+#endif
+	pmovmskb %xmm2, %esi
+	pmovmskb %xmm3, %ecx
+	sal	$16, %ecx
+	or	%esi, %ecx
 	salq	$32, %rcx
 	orq	%rcx, %rdx
-#endif
-
-	/* When no zero byte is found xmm9-11 are zero so we do not have to
-	   zero them.  */
-	PROLOG(loop)
+	je	L(loop_init)
+	bsfq	%rdx, %rax
+	ret
 
 	.p2align 4
 L(cross_page):
-	andq	$-64, %rax
-	FIND_ZERO
-	PROLOG(loop_init)
 
+	movq	%rdi, %rax
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
 #ifdef AS_STRNLEN
-/* We must do this check to correctly handle strnlen (s, -1).  */
-L(strnlen_ret):
-	bts	%rsi, %rdx
+	mov	%rdx, %r9
+#endif
+	andq	$-64, %rax
+	pcmpeqb	(%rax), %xmm0
+	pcmpeqb	16(%rax), %xmm1
+	pcmpeqb	32(%rax), %xmm2
+	pcmpeqb	48(%rax), %xmm3
+	pmovmskb %xmm0, %esi
+	pxor	%xmm0, %xmm0
+	pmovmskb %xmm1, %edx
+	pmovmskb %xmm2, %r10d
+	pmovmskb %xmm3, %ecx
+	sal	$16, %edx
+	sal	$16, %ecx
+	or	%esi, %edx
+	or	%r10, %rcx
+	salq	$32, %rcx
+	orq	%rcx, %rdx
+	mov	%edi, %ecx
+#ifdef AS_STRNLEN
+	salq	%cl, %r9
+	or	%r9, %rdx
+#endif
 	sarq	%cl, %rdx
 	test	%rdx, %rdx
 	je	L(loop_init)
 	bsfq	%rdx, %rax
 	ret
-#endif
 	.p2align 4
 L(loop_init):
-	pxor	%xmm9, %xmm9
-	pxor	%xmm10, %xmm10
-	pxor	%xmm11, %xmm11
+	movq	%rdi, %rax
+	andq	$-64, %rax
 #ifdef AS_STRNLEN
+	add	%rdi, %r8
+	sub	%rax, %r8
+	cmp	$64, %r8
+	je	L(return_noread)
+#endif
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+#ifdef USE_AVX2
+	vpxor	%xmm0, %xmm0, %xmm0
+#endif
 	.p2align 4
 L(loop):
+#ifdef USE_AVX2
+	vmovdqa	64(%rax), %ymm1
+	vpminub	96(%rax), %ymm1, %ymm2
+	vpcmpeqb %ymm0, %ymm2, %ymm2
+	vpmovmskb %ymm2, %edx
+#else
+	movdqa	64(%rax), %xmm5
+	pminub	80(%rax), %xmm5
+	pminub	96(%rax), %xmm5
+	pminub	112(%rax), %xmm5
+	pcmpeqb	%xmm0, %xmm5
+	pmovmskb %xmm5, %edx
+#endif
 
-	addq	$64, %rax
-	cmpq	%rax, %r10
-	je	L(exit_end)
-
-	movdqa	(%rax), %xmm8
-	pminub	16(%rax), %xmm8
-	pminub	32(%rax), %xmm8
-	pminub	48(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+#ifdef AS_STRNLEN
+	sub	$64, %r8
 	testl	%edx, %edx
-	jne	L(exit)
-	jmp	L(loop)
-
-	.p2align 4
-L(exit_end):
-	cmp	%rax, %r11
-	je	L(first) /* Do not read when end is at page boundary.  */
-	pxor	%xmm8, %xmm8
-	FIND_ZERO
-
-L(first):
-	bts	%r11, %rdx
-	bsfq	%rdx, %rdx
-	addq	%rdx, %rax
-	subq	%rdi, %rax
-	ret
-
-	.p2align 4
-L(exit):
-	pxor	%xmm8, %xmm8
-	FIND_ZERO
-
-	bsfq	%rdx, %rdx
-	addq	%rdx, %rax
-	subq	%rdi, %rax
-	ret
-
+	jne	L(exit64)
+	cmp	$64, %r8
+	jbe	L(exit64_zero)
 #else
-
-	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
-	.p2align 4
-L(loop):
-
-	movdqa	64(%rax), %xmm8
-	pminub	80(%rax), %xmm8
-	pminub	96(%rax), %xmm8
-	pminub	112(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
 	testl	%edx, %edx
 	jne	L(exit64)
+#endif
 
 	subq	$-128, %rax
-
-	movdqa	(%rax), %xmm8
-	pminub	16(%rax), %xmm8
-	pminub	32(%rax), %xmm8
-	pminub	48(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+#ifdef USE_AVX2
+	vmovdqa	(%rax), %ymm1
+	vpminub	32(%rax), %ymm1, %ymm2
+	vpcmpeqb %ymm0, %ymm2, %ymm2
+	vpmovmskb %ymm2, %edx
+#else
+	movdqa	(%rax), %xmm5
+	pminub	16(%rax), %xmm5
+	pminub	32(%rax), %xmm5
+	pminub	48(%rax), %xmm5
+	pcmpeqb	%xmm0, %xmm5
+	pmovmskb %xmm5, %edx
+#endif
+#ifdef AS_STRNLEN
+	sub	$64, %r8
 	testl	%edx, %edx
 	jne	L(exit0)
+	cmp	$64, %r8
+	jbe	L(exit0_zero)
+#else
+	testl	%edx, %edx
+	jne	L(exit0)
+#endif
 	jmp	L(loop)
 
+#ifdef AS_STRNLEN
+	.p2align 4
+L(exit64_zero):
+	addq	$64, %rax
+L(exit0_zero):
+	add	%r8, %rax
+	sub	%rdi, %rax
+	ret
+#endif
 	.p2align 4
+
+
 L(exit64):
 	addq	$64, %rax
 L(exit0):
-	pxor	%xmm8, %xmm8
-	FIND_ZERO
-
+#ifdef USE_AVX2
+	sal	$32, %rdx
+#else
+	sal	$48, %rdx
+#endif
+#ifdef AS_STRNLEN
+	cmp	$64, %r8
+	jae	L(dont_set2)
+	bts	%r8, %rdx
+	L(dont_set2):
+#endif
+#ifdef USE_AVX2
+	subq    %rdi, %rax
+	vpcmpeqb %ymm0, %ymm1, %ymm1
+	vpmovmskb %ymm1, %ecx
+	vzeroupper
+	or	%rcx, %rdx
+#else
+	pcmpeqb	(%rax), %xmm0
+	pcmpeqb	16(%rax), %xmm1
+	pcmpeqb	32(%rax), %xmm2
+	subq	%rdi, %rax
+	pmovmskb %xmm0, %esi
+	pmovmskb %xmm1, %ecx
+	pmovmskb %xmm2, %r8d
+	sal	$16, %ecx
+	or	%esi, %ecx
+	salq	$32, %r8
+	orq	%r8, %rcx
+	orq	%rcx, %rdx
+#endif
 	bsfq	%rdx, %rdx
 	addq	%rdx, %rax
-	subq	%rdi, %rax
 	ret
-
-#endif
-
 END(strlen)
 libc_hidden_builtin_def (strlen)