[v3,1/2] x86: Optimize strchr-avx2.S

Message ID 20210422180403.422364-1-goldstein.w.n@gmail.com
State Superseded
Series [v3,1/2] x86: Optimize strchr-avx2.S

Commit Message

Noah Goldstein April 22, 2021, 6:04 p.m. UTC
  No bug. This commit optimizes strchr-avx2.S. The optimizations are all
small things such as saving an ALU in the alignment process, saving a
few instructions in the loop return, saving some bytes in the main
loop, and increasing the ILP in the return cases. test-strchr,
test-strchrnul, test-wcschr, and test-wcschrnul are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/strchr-avx2.S | 294 +++++++++++++++----------
 1 file changed, 173 insertions(+), 121 deletions(-)
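
As background for the commit message above, the check the routine performs on
its first 32-byte vector can be modelled in C with AVX2 intrinsics roughly as
follows. This is a sketch, not the glibc code: the function name is made up,
the miss handling is simplified to a single vector, and the unaligned 32-byte
load is assumed not to run into an unmapped page (the real code guards that
with the PAGE_SIZE check before taking this path). Compile with -mavx2 on
GCC/Clang.

#include <immintrin.h>
#include <stddef.h>

/* One-vector probe: find the first byte equal to C or to '\0' in the 32
   bytes at S, mirroring the VPCMPEQ/VPCMPEQ/vpor/vpmovmskb sequence.  */
static const char *first_vec_probe(const char *s, int c)
{
    __m256i chr  = _mm256_set1_epi8((char) c);
    __m256i zero = _mm256_setzero_si256();
    __m256i v    = _mm256_loadu_si256((const __m256i *) s);

    /* 0xff in every lane that holds CHAR or the null byte.  */
    __m256i hit = _mm256_or_si256(_mm256_cmpeq_epi8(v, chr),
                                  _mm256_cmpeq_epi8(v, zero));
    unsigned int mask = (unsigned int) _mm256_movemask_epi8(hit);
    if (mask == 0)
        return NULL;            /* neither CHAR nor '\0' in these 32 bytes */

    size_t i = (size_t) __builtin_ctz(mask);    /* tzcntl */
    /* strchr returns NULL when the null byte is hit first; strchrnul
       would return s + i unconditionally.  */
    return s[i] == (char) c ? s + i : NULL;
}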
  

Comments

H.J. Lu April 23, 2021, 4:56 p.m. UTC | #1
On Thu, Apr 22, 2021 at 02:04:02PM -0400, Noah Goldstein wrote:
> No bug. This commit optimizes strchr-avx2.S. The optimizations are all
> small things such as save an ALU in the alignment process, saving a
> few instructions in the loop return, saving some bytes in the main
> loop, and increasing the ILP in the return cases. test-strchr,
> test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
> 
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  sysdeps/x86_64/multiarch/strchr-avx2.S | 294 +++++++++++++++----------
>  1 file changed, 173 insertions(+), 121 deletions(-)
> 
> diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> index 25bec38b5d..220165d2ba 100644
> --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> @@ -49,132 +49,144 @@
>  
>  	.section SECTION(.text),"ax",@progbits
>  ENTRY (STRCHR)
> -	movl	%edi, %ecx
> -# ifndef USE_AS_STRCHRNUL
> -	xorl	%edx, %edx
> -# endif
> -
>  	/* Broadcast CHAR to YMM0.	*/
>  	vmovd	%esi, %xmm0
> +	movl	%edi, %eax
> +	andl	$(PAGE_SIZE - 1), %eax
> +	VPBROADCAST	%xmm0, %ymm0
>  	vpxor	%xmm9, %xmm9, %xmm9
> -	VPBROADCAST %xmm0, %ymm0
>  
>  	/* Check if we cross page boundary with one vector load.  */
> -	andl	$(PAGE_SIZE - 1), %ecx
> -	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
> -	ja  L(cross_page_boundary)
> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
> +	ja	L(cross_page_boundary)
>  
>  	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
>  	   null byte.  */
>  	vmovdqu	(%rdi), %ymm8
> -	VPCMPEQ %ymm8, %ymm0, %ymm1
> -	VPCMPEQ %ymm8, %ymm9, %ymm2
> +	VPCMPEQ	%ymm8, %ymm0, %ymm1
> +	VPCMPEQ	%ymm8, %ymm9, %ymm2
>  	vpor	%ymm1, %ymm2, %ymm1
> -	vpmovmskb %ymm1, %eax
> +	vpmovmskb	%ymm1, %eax

This change isn't needed.

>  	testl	%eax, %eax
> -	jz	L(more_vecs)
> +	jz	L(aligned_more)
>  	tzcntl	%eax, %eax
> -	/* Found CHAR or the null byte.	 */
> -	addq	%rdi, %rax
>  # ifndef USE_AS_STRCHRNUL
> -	cmp (%rax), %CHAR_REG
> -	cmovne	%rdx, %rax
> +	/* Found CHAR or the null byte.	 */
> +	cmp	(%rdi, %rax), %CHAR_REG
> +	jne	L(zero)
>  # endif
> -L(return_vzeroupper):
> -	ZERO_UPPER_VEC_REGISTERS_RETURN
> -
> -	.p2align 4
> -L(more_vecs):
> -	/* Align data for aligned loads in the loop.  */
> -	andq	$-VEC_SIZE, %rdi
> -L(aligned_more):
> -
> -	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
> -	   since data is only aligned to VEC_SIZE.	*/
> -	vmovdqa	VEC_SIZE(%rdi), %ymm8
> -	addq	$VEC_SIZE, %rdi
> -	VPCMPEQ %ymm8, %ymm0, %ymm1
> -	VPCMPEQ %ymm8, %ymm9, %ymm2
> -	vpor	%ymm1, %ymm2, %ymm1
> -	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x0)
> -
> -	vmovdqa	VEC_SIZE(%rdi), %ymm8
> -	VPCMPEQ %ymm8, %ymm0, %ymm1
> -	VPCMPEQ %ymm8, %ymm9, %ymm2
> -	vpor	%ymm1, %ymm2, %ymm1
> -	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x1)
> -
> -	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm8
> -	VPCMPEQ %ymm8, %ymm0, %ymm1
> -	VPCMPEQ %ymm8, %ymm9, %ymm2
> -	vpor	%ymm1, %ymm2, %ymm1
> -	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x2)
> -
> -	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
> -	VPCMPEQ %ymm8, %ymm0, %ymm1
> -	VPCMPEQ %ymm8, %ymm9, %ymm2
> -	vpor	%ymm1, %ymm2, %ymm1
> -	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> -	jz	L(prep_loop_4x)
> +	addq	%rdi, %rax
> +	VZEROUPPER_RETURN
>  
> +	/* .p2align 5 helps keep performance more consistent if ENTRY()
> +	   alignment % 32 was either 16 or 0. As well this makes the
> +	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
> +	   easier.  */
> +	.p2align 5
> +L(first_vec_x4):
>  	tzcntl	%eax, %eax
> -	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
> +	addq	$(VEC_SIZE * 3 + 1), %rdi
>  # ifndef USE_AS_STRCHRNUL
> -	cmp (%rax), %CHAR_REG
> -	cmovne	%rdx, %rax
> +	/* Found CHAR or the null byte.	 */
> +	cmp	(%rdi, %rax), %CHAR_REG
> +	jne	L(zero)
>  # endif
> +	addq	%rdi, %rax
>  	VZEROUPPER_RETURN
>  
> -	.p2align 4
> -L(first_vec_x0):
> -	tzcntl	%eax, %eax
> -	/* Found CHAR or the null byte.	 */
> -	addq	%rdi, %rax
>  # ifndef USE_AS_STRCHRNUL
> -	cmp (%rax), %CHAR_REG
> -	cmovne	%rdx, %rax
> -# endif
> +L(zero):
> +	xorl	%eax, %eax
>  	VZEROUPPER_RETURN
> +# endif
> +
>  
>  	.p2align 4
>  L(first_vec_x1):
>  	tzcntl	%eax, %eax
> -	leaq	VEC_SIZE(%rdi, %rax), %rax
> +	incq	%rdi
>  # ifndef USE_AS_STRCHRNUL
> -	cmp (%rax), %CHAR_REG
> -	cmovne	%rdx, %rax
> +	/* Found CHAR or the null byte.	 */
> +	cmp	(%rdi, %rax), %CHAR_REG
> +	jne	L(zero)
>  # endif
> +	addq	%rdi, %rax
>  	VZEROUPPER_RETURN
>  
>  	.p2align 4
>  L(first_vec_x2):
>  	tzcntl	%eax, %eax
> +	addq	$(VEC_SIZE + 1), %rdi
> +# ifndef USE_AS_STRCHRNUL
>  	/* Found CHAR or the null byte.	 */
> -	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
> +	cmp	(%rdi, %rax), %CHAR_REG
> +	jne	L(zero)
> +# endif
> +	addq	%rdi, %rax
> +	VZEROUPPER_RETURN
> +
> +	.p2align 4
> +L(first_vec_x3):
> +	tzcntl	%eax, %eax
> +	addq	$(VEC_SIZE * 2 + 1), %rdi
>  # ifndef USE_AS_STRCHRNUL
> -	cmp (%rax), %CHAR_REG
> -	cmovne	%rdx, %rax
> +	/* Found CHAR or the null byte.	 */
> +	cmp	(%rdi, %rax), %CHAR_REG
> +	jne	L(zero)
>  # endif
> +	addq	%rdi, %rax
>  	VZEROUPPER_RETURN
>  
> -L(prep_loop_4x):
> -	/* Align data to 4 * VEC_SIZE.	*/
> -	andq	$-(VEC_SIZE * 4), %rdi
> +	.p2align 4
> +L(aligned_more):
> +	/* Align data to VEC_SIZE - 1. This is the same number of
> +	   instructions as using andq -VEC_SIZE but saves 4 bytes of code on
> +	   x4 check.  */
> +	orq	$(VEC_SIZE - 1), %rdi
> +L(cross_page_continue):
> +	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time since
> +	   data is only aligned to VEC_SIZE.  */
> +	vmovdqa	1(%rdi), %ymm8
> +	VPCMPEQ	%ymm8, %ymm0, %ymm1
> +	VPCMPEQ	%ymm8, %ymm9, %ymm2
> +	vpor	%ymm1, %ymm2, %ymm1
> +	vpmovmskb	%ymm1, %eax

Use a space, not tab, after vpmovmskb.

> +	testl	%eax, %eax
> +	jnz	L(first_vec_x1)
>  
> +	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
> +	VPCMPEQ	%ymm8, %ymm0, %ymm1
> +	VPCMPEQ	%ymm8, %ymm9, %ymm2
> +	vpor	%ymm1, %ymm2, %ymm1
> +	vpmovmskb	%ymm1, %eax

Use a space, not tab, after vpmovmskb.

> +	testl	%eax, %eax
> +	jnz	L(first_vec_x2)
> +
> +	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
> +	VPCMPEQ	%ymm8, %ymm0, %ymm1
> +	VPCMPEQ	%ymm8, %ymm9, %ymm2
> +	vpor	%ymm1, %ymm2, %ymm1
> +	vpmovmskb	%ymm1, %eax

Use a space, not tab, after vpmovmskb.

> +	testl	%eax, %eax
> +	jnz	L(first_vec_x3)
> +
> +	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
> +	VPCMPEQ	%ymm8, %ymm0, %ymm1
> +	VPCMPEQ	%ymm8, %ymm9, %ymm2
> +	vpor	%ymm1, %ymm2, %ymm1
> +	vpmovmskb	%ymm1, %eax

Use a space, not tab, after vpmovmskb.

> +	testl	%eax, %eax
> +	jnz	L(first_vec_x4)
> +	/* Align data to VEC_SIZE * 4 - 1.	*/
> +	addq	$(VEC_SIZE * 4 + 1), %rdi
> +	andq	$-(VEC_SIZE * 4), %rdi
>  	.p2align 4
>  L(loop_4x_vec):
>  	/* Compare 4 * VEC at a time forward.  */
> -	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
> -	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
> -	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
> -	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
> +	vmovdqa	(%rdi), %ymm5
> +	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
> +	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
> +	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
>  
>  	/* Leaves only CHARS matching esi as 0.	 */
>  	vpxor	%ymm5, %ymm0, %ymm1
> @@ -190,62 +202,102 @@ L(loop_4x_vec):
>  	VPMINU	%ymm1, %ymm2, %ymm5
>  	VPMINU	%ymm3, %ymm4, %ymm6
>  
> -	VPMINU	%ymm5, %ymm6, %ymm5
> +	VPMINU	%ymm5, %ymm6, %ymm6
>  
> -	VPCMPEQ %ymm5, %ymm9, %ymm5
> -	vpmovmskb %ymm5, %eax
> +	VPCMPEQ	%ymm6, %ymm9, %ymm6
> +	vpmovmskb	%ymm6, %ecx

Use a space, not tab, after vpmovmskb.

> +	subq	$-(VEC_SIZE * 4), %rdi
> +	testl	%ecx, %ecx
> +	jz	L(loop_4x_vec)
>  
> -	addq	$(VEC_SIZE * 4), %rdi
> -	testl	%eax, %eax
> -	jz  L(loop_4x_vec)
>  
> -	VPCMPEQ %ymm1, %ymm9, %ymm1
> -	vpmovmskb %ymm1, %eax
> +	VPCMPEQ	%ymm1, %ymm9, %ymm1
> +	vpmovmskb	%ymm1, %eax

Use a space, not tab, after vpmovmskb.

>  	testl	%eax, %eax
> -	jnz	L(first_vec_x0)
> +	jnz	L(last_vec_x0)
> +
>  
> -	VPCMPEQ %ymm2, %ymm9, %ymm2
> -	vpmovmskb %ymm2, %eax
> +	VPCMPEQ	%ymm5, %ymm9, %ymm2
> +	vpmovmskb	%ymm2, %eax

Use a space, not tab, after vpmovmskb.

>  	testl	%eax, %eax
> -	jnz	L(first_vec_x1)
> +	jnz	L(last_vec_x1)
> +
> +	VPCMPEQ	%ymm3, %ymm9, %ymm3
> +	vpmovmskb	%ymm3, %eax
> +	/* rcx has combined result from all 4 VEC. It will only be used if
> +	   the first 3 other VEC all did not contain a match.  */
> +	salq	$32, %rcx
> +	orq	%rcx, %rax
> +	tzcntq	%rax, %rax
> +	subq	$(VEC_SIZE * 2), %rdi
> +# ifndef USE_AS_STRCHRNUL
> +	/* Found CHAR or the null byte.	 */
> +	cmp	(%rdi, %rax), %CHAR_REG
> +	jne	L(zero_end)
> +# endif
> +	addq	%rdi, %rax
> +	VZEROUPPER_RETURN
> +
>  
> -	VPCMPEQ %ymm3, %ymm9, %ymm3
> -	VPCMPEQ %ymm4, %ymm9, %ymm4
> -	vpmovmskb %ymm3, %ecx
> -	vpmovmskb %ymm4, %eax
> -	salq	$32, %rax
> -	orq %rcx, %rax
> -	tzcntq  %rax, %rax
> -	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
> +	.p2align 4
> +L(last_vec_x0):
> +	tzcntl	%eax, %eax
> +	addq	$-(VEC_SIZE * 4), %rdi
>  # ifndef USE_AS_STRCHRNUL
> -	cmp (%rax), %CHAR_REG
> -	cmovne	%rdx, %rax
> +	/* Found CHAR or the null byte.	 */
> +	cmp	(%rdi, %rax), %CHAR_REG
> +	jne	L(zero_end)
>  # endif
> +	addq	%rdi, %rax
>  	VZEROUPPER_RETURN
>  
> +# ifndef USE_AS_STRCHRNUL
> +L(zero_end):
> +	xorl	%eax, %eax
> +	VZEROUPPER_RETURN
> +# endif
> +
> +	.p2align 4
> +L(last_vec_x1):
> +	tzcntl	%eax, %eax
> +	subq	$(VEC_SIZE * 3), %rdi
> +# ifndef USE_AS_STRCHRNUL
> +	/* Found CHAR or the null byte.	 */
> +	cmp	(%rdi, %rax), %CHAR_REG
> +	jne	L(zero_end)
> +# endif
> +	addq	%rdi, %rax
> +	VZEROUPPER_RETURN
> +
> +
>  	/* Cold case for crossing page with first load.	 */
>  	.p2align 4
>  L(cross_page_boundary):
> -	andq	$-VEC_SIZE, %rdi
> -	andl	$(VEC_SIZE - 1), %ecx
> -
> -	vmovdqa	(%rdi), %ymm8
> -	VPCMPEQ %ymm8, %ymm0, %ymm1
> -	VPCMPEQ %ymm8, %ymm9, %ymm2
> +	movq	%rdi, %rdx
> +	/* Align rdi to VEC_SIZE - 1.  */
> +	orq	$(VEC_SIZE - 1), %rdi
> +	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
> +	VPCMPEQ	%ymm8, %ymm0, %ymm1
> +	VPCMPEQ	%ymm8, %ymm9, %ymm2
>  	vpor	%ymm1, %ymm2, %ymm1
> -	vpmovmskb %ymm1, %eax
> -	/* Remove the leading bits.	 */
> -	sarxl	%ecx, %eax, %eax
> +	vpmovmskb	%ymm1, %eax
> +	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> +	   so no need to manually mod edx.  */
> +	sarxl	%edx, %eax, %eax
>  	testl	%eax, %eax
> -	jz	L(aligned_more)
> +	jz	L(cross_page_continue)
>  	tzcntl	%eax, %eax
> -	addq	%rcx, %rdi
> -	addq	%rdi, %rax
>  # ifndef USE_AS_STRCHRNUL
> -	cmp (%rax), %CHAR_REG
> -	cmovne	%rdx, %rax
> +	xorl	%ecx, %ecx
> +	/* Found CHAR or the null byte.	 */
> +	cmp	(%rdx, %rax), %CHAR_REG
> +	leaq	(%rdx, %rax), %rax
> +	cmovne	%rcx, %rax
> +# else
> +	addq	%rdx, %rax
>  # endif
> -	VZEROUPPER_RETURN
> +L(return_vzeroupper):
> +	ZERO_UPPER_VEC_REGISTERS_RETURN
>  
>  END (STRCHR)
>  # endif
> -- 
> 2.29.2
> 

Thanks.

H.J.
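
The main-loop hunks quoted above lean on the vpxor/VPMINU trick that the
"Leaves only CHARS matching esi as 0" comment refers to: XORing the data with
the broadcast CHAR zeroes exactly the matching lanes, and an unsigned byte
minimum with the original data then also pulls in any '\0' lanes, so a single
compare against zero detects both terminators for a whole vector. A hedged C
sketch of that step (AVX2 intrinsics, invented helper name, -mavx2 assumed):

#include <immintrin.h>
#include <stdbool.h>

/* True if the 32 bytes in V contain the broadcast character in BCAST_C or
   a null byte.  min_u8(v ^ c, v) is zero in a lane iff v == c (the XOR
   term is 0) or v == 0 (the data byte itself wins the minimum).  */
static inline bool vec_has_char_or_null(__m256i v, __m256i bcast_c)
{
    __m256i t = _mm256_min_epu8(_mm256_xor_si256(v, bcast_c), v);
    __m256i z = _mm256_cmpeq_epi8(t, _mm256_setzero_si256());
    return _mm256_movemask_epi8(z) != 0;
}

In the assembly the four per-vector results are folded with further VPMINU
steps so the loop's only branch tests a single combined mask.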
  
Noah Goldstein April 23, 2021, 7:55 p.m. UTC | #2
On Fri, Apr 23, 2021 at 12:56 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Apr 22, 2021 at 02:04:02PM -0400, Noah Goldstein wrote:
> > No bug. This commit optimizes strchr-avx2.S. The optimizations are all
> > small things such as save an ALU in the alignment process, saving a
> > few instructions in the loop return, saving some bytes in the main
> > loop, and increasing the ILP in the return cases. test-strchr,
> > test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> >  sysdeps/x86_64/multiarch/strchr-avx2.S | 294 +++++++++++++++----------
> >  1 file changed, 173 insertions(+), 121 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > index 25bec38b5d..220165d2ba 100644
> > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > @@ -49,132 +49,144 @@
> >
> >       .section SECTION(.text),"ax",@progbits
> >  ENTRY (STRCHR)
> > -     movl    %edi, %ecx
> > -# ifndef USE_AS_STRCHRNUL
> > -     xorl    %edx, %edx
> > -# endif
> > -
> >       /* Broadcast CHAR to YMM0.      */
> >       vmovd   %esi, %xmm0
> > +     movl    %edi, %eax
> > +     andl    $(PAGE_SIZE - 1), %eax
> > +     VPBROADCAST     %xmm0, %ymm0
> >       vpxor   %xmm9, %xmm9, %xmm9
> > -     VPBROADCAST %xmm0, %ymm0
> >
> >       /* Check if we cross page boundary with one vector load.  */
> > -     andl    $(PAGE_SIZE - 1), %ecx
> > -     cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> > -     ja  L(cross_page_boundary)
> > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > +     ja      L(cross_page_boundary)
> >
> >       /* Check the first VEC_SIZE bytes.      Search for both CHAR and the
> >          null byte.  */
> >       vmovdqu (%rdi), %ymm8
> > -     VPCMPEQ %ymm8, %ymm0, %ymm1
> > -     VPCMPEQ %ymm8, %ymm9, %ymm2
> > +     VPCMPEQ %ymm8, %ymm0, %ymm1
> > +     VPCMPEQ %ymm8, %ymm9, %ymm2
> >       vpor    %ymm1, %ymm2, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > +     vpmovmskb       %ymm1, %eax
>
> This change isn't needed.

Fixed. For the future, what is the rule for when to have a tab after
the instruction vs. just a space?

>
> >       testl   %eax, %eax
> > -     jz      L(more_vecs)
> > +     jz      L(aligned_more)
> >       tzcntl  %eax, %eax
> > -     /* Found CHAR or the null byte.  */
> > -     addq    %rdi, %rax
> >  # ifndef USE_AS_STRCHRNUL
> > -     cmp (%rax), %CHAR_REG
> > -     cmovne  %rdx, %rax
> > +     /* Found CHAR or the null byte.  */
> > +     cmp     (%rdi, %rax), %CHAR_REG
> > +     jne     L(zero)
> >  # endif
> > -L(return_vzeroupper):
> > -     ZERO_UPPER_VEC_REGISTERS_RETURN
> > -
> > -     .p2align 4
> > -L(more_vecs):
> > -     /* Align data for aligned loads in the loop.  */
> > -     andq    $-VEC_SIZE, %rdi
> > -L(aligned_more):
> > -
> > -     /* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > -        since data is only aligned to VEC_SIZE.      */
> > -     vmovdqa VEC_SIZE(%rdi), %ymm8
> > -     addq    $VEC_SIZE, %rdi
> > -     VPCMPEQ %ymm8, %ymm0, %ymm1
> > -     VPCMPEQ %ymm8, %ymm9, %ymm2
> > -     vpor    %ymm1, %ymm2, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > -
> > -     vmovdqa VEC_SIZE(%rdi), %ymm8
> > -     VPCMPEQ %ymm8, %ymm0, %ymm1
> > -     VPCMPEQ %ymm8, %ymm9, %ymm2
> > -     vpor    %ymm1, %ymm2, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x1)
> > -
> > -     vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8
> > -     VPCMPEQ %ymm8, %ymm0, %ymm1
> > -     VPCMPEQ %ymm8, %ymm9, %ymm2
> > -     vpor    %ymm1, %ymm2, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x2)
> > -
> > -     vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > -     VPCMPEQ %ymm8, %ymm0, %ymm1
> > -     VPCMPEQ %ymm8, %ymm9, %ymm2
> > -     vpor    %ymm1, %ymm2, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -     jz      L(prep_loop_4x)
> > +     addq    %rdi, %rax
> > +     VZEROUPPER_RETURN
> >
> > +     /* .p2align 5 helps keep performance more consistent if ENTRY()
> > +        alignment % 32 was either 16 or 0. As well this makes the
> > +        alignment % 32 of the loop_4x_vec fixed which makes tuning it
> > +        easier.  */
> > +     .p2align 5
> > +L(first_vec_x4):
> >       tzcntl  %eax, %eax
> > -     leaq    (VEC_SIZE * 3)(%rdi, %rax), %rax
> > +     addq    $(VEC_SIZE * 3 + 1), %rdi
> >  # ifndef USE_AS_STRCHRNUL
> > -     cmp (%rax), %CHAR_REG
> > -     cmovne  %rdx, %rax
> > +     /* Found CHAR or the null byte.  */
> > +     cmp     (%rdi, %rax), %CHAR_REG
> > +     jne     L(zero)
> >  # endif
> > +     addq    %rdi, %rax
> >       VZEROUPPER_RETURN
> >
> > -     .p2align 4
> > -L(first_vec_x0):
> > -     tzcntl  %eax, %eax
> > -     /* Found CHAR or the null byte.  */
> > -     addq    %rdi, %rax
> >  # ifndef USE_AS_STRCHRNUL
> > -     cmp (%rax), %CHAR_REG
> > -     cmovne  %rdx, %rax
> > -# endif
> > +L(zero):
> > +     xorl    %eax, %eax
> >       VZEROUPPER_RETURN
> > +# endif
> > +
> >
> >       .p2align 4
> >  L(first_vec_x1):
> >       tzcntl  %eax, %eax
> > -     leaq    VEC_SIZE(%rdi, %rax), %rax
> > +     incq    %rdi
> >  # ifndef USE_AS_STRCHRNUL
> > -     cmp (%rax), %CHAR_REG
> > -     cmovne  %rdx, %rax
> > +     /* Found CHAR or the null byte.  */
> > +     cmp     (%rdi, %rax), %CHAR_REG
> > +     jne     L(zero)
> >  # endif
> > +     addq    %rdi, %rax
> >       VZEROUPPER_RETURN
> >
> >       .p2align 4
> >  L(first_vec_x2):
> >       tzcntl  %eax, %eax
> > +     addq    $(VEC_SIZE + 1), %rdi
> > +# ifndef USE_AS_STRCHRNUL
> >       /* Found CHAR or the null byte.  */
> > -     leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> > +     cmp     (%rdi, %rax), %CHAR_REG
> > +     jne     L(zero)
> > +# endif
> > +     addq    %rdi, %rax
> > +     VZEROUPPER_RETURN
> > +
> > +     .p2align 4
> > +L(first_vec_x3):
> > +     tzcntl  %eax, %eax
> > +     addq    $(VEC_SIZE * 2 + 1), %rdi
> >  # ifndef USE_AS_STRCHRNUL
> > -     cmp (%rax), %CHAR_REG
> > -     cmovne  %rdx, %rax
> > +     /* Found CHAR or the null byte.  */
> > +     cmp     (%rdi, %rax), %CHAR_REG
> > +     jne     L(zero)
> >  # endif
> > +     addq    %rdi, %rax
> >       VZEROUPPER_RETURN
> >
> > -L(prep_loop_4x):
> > -     /* Align data to 4 * VEC_SIZE.  */
> > -     andq    $-(VEC_SIZE * 4), %rdi
> > +     .p2align 4
> > +L(aligned_more):
> > +     /* Align data to VEC_SIZE - 1. This is the same number of
> > +        instructions as using andq -VEC_SIZE but saves 4 bytes of code on
> > +        x4 check.  */
> > +     orq     $(VEC_SIZE - 1), %rdi
> > +L(cross_page_continue):
> > +     /* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time since
> > +        data is only aligned to VEC_SIZE.  */
> > +     vmovdqa 1(%rdi), %ymm8
> > +     VPCMPEQ %ymm8, %ymm0, %ymm1
> > +     VPCMPEQ %ymm8, %ymm9, %ymm2
> > +     vpor    %ymm1, %ymm2, %ymm1
> > +     vpmovmskb       %ymm1, %eax
>
> Use a space, not tab, after vpmovmskb.

Fixed.

>
> > +     testl   %eax, %eax
> > +     jnz     L(first_vec_x1)
> >
> > +     vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8
> > +     VPCMPEQ %ymm8, %ymm0, %ymm1
> > +     VPCMPEQ %ymm8, %ymm9, %ymm2
> > +     vpor    %ymm1, %ymm2, %ymm1
> > +     vpmovmskb       %ymm1, %eax
>
> Use a space, not tab, after vpmovmskb.
>

Fixed.

> > +     testl   %eax, %eax
> > +     jnz     L(first_vec_x2)
> > +
> > +     vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8
> > +     VPCMPEQ %ymm8, %ymm0, %ymm1
> > +     VPCMPEQ %ymm8, %ymm9, %ymm2
> > +     vpor    %ymm1, %ymm2, %ymm1
> > +     vpmovmskb       %ymm1, %eax
>
> Use a space, not tab, after vpmovmskb.
>

Fixed.

> > +     testl   %eax, %eax
> > +     jnz     L(first_vec_x3)
> > +
> > +     vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8
> > +     VPCMPEQ %ymm8, %ymm0, %ymm1
> > +     VPCMPEQ %ymm8, %ymm9, %ymm2
> > +     vpor    %ymm1, %ymm2, %ymm1
> > +     vpmovmskb       %ymm1, %eax
>
> Use a space, not tab, after vpmovmskb.
>

Fixed.

> > +     testl   %eax, %eax
> > +     jnz     L(first_vec_x4)
> > +     /* Align data to VEC_SIZE * 4 - 1.      */
> > +     addq    $(VEC_SIZE * 4 + 1), %rdi
> > +     andq    $-(VEC_SIZE * 4), %rdi
> >       .p2align 4
> >  L(loop_4x_vec):
> >       /* Compare 4 * VEC at a time forward.  */
> > -     vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> > -     vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> > -     vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> > -     vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> > +     vmovdqa (%rdi), %ymm5
> > +     vmovdqa (VEC_SIZE)(%rdi), %ymm6
> > +     vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> > +     vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> >
> >       /* Leaves only CHARS matching esi as 0.  */
> >       vpxor   %ymm5, %ymm0, %ymm1
> > @@ -190,62 +202,102 @@ L(loop_4x_vec):
> >       VPMINU  %ymm1, %ymm2, %ymm5
> >       VPMINU  %ymm3, %ymm4, %ymm6
> >
> > -     VPMINU  %ymm5, %ymm6, %ymm5
> > +     VPMINU  %ymm5, %ymm6, %ymm6
> >
> > -     VPCMPEQ %ymm5, %ymm9, %ymm5
> > -     vpmovmskb %ymm5, %eax
> > +     VPCMPEQ %ymm6, %ymm9, %ymm6
> > +     vpmovmskb       %ymm6, %ecx
>
> Use a space, not tab, after vpmovmskb.
>

Fixed.

> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +     testl   %ecx, %ecx
> > +     jz      L(loop_4x_vec)
> >
> > -     addq    $(VEC_SIZE * 4), %rdi
> > -     testl   %eax, %eax
> > -     jz  L(loop_4x_vec)
> >
> > -     VPCMPEQ %ymm1, %ymm9, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > +     VPCMPEQ %ymm1, %ymm9, %ymm1
> > +     vpmovmskb       %ymm1, %eax
>
> Use a space, not tab, after vpmovmskb.
>

Fixed.

> >       testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > +     jnz     L(last_vec_x0)
> > +
> >
> > -     VPCMPEQ %ymm2, %ymm9, %ymm2
> > -     vpmovmskb %ymm2, %eax
> > +     VPCMPEQ %ymm5, %ymm9, %ymm2
> > +     vpmovmskb       %ymm2, %eax
>
> Use a space, not tab, after vpmovmskb.
>

Fixed.

> >       testl   %eax, %eax
> > -     jnz     L(first_vec_x1)
> > +     jnz     L(last_vec_x1)
> > +
> > +     VPCMPEQ %ymm3, %ymm9, %ymm3
> > +     vpmovmskb       %ymm3, %eax
> > +     /* rcx has combined result from all 4 VEC. It will only be used if
> > +        the first 3 other VEC all did not contain a match.  */
> > +     salq    $32, %rcx
> > +     orq     %rcx, %rax
> > +     tzcntq  %rax, %rax
> > +     subq    $(VEC_SIZE * 2), %rdi
> > +# ifndef USE_AS_STRCHRNUL
> > +     /* Found CHAR or the null byte.  */
> > +     cmp     (%rdi, %rax), %CHAR_REG
> > +     jne     L(zero_end)
> > +# endif
> > +     addq    %rdi, %rax
> > +     VZEROUPPER_RETURN
> > +
> >
> > -     VPCMPEQ %ymm3, %ymm9, %ymm3
> > -     VPCMPEQ %ymm4, %ymm9, %ymm4
> > -     vpmovmskb %ymm3, %ecx
> > -     vpmovmskb %ymm4, %eax
> > -     salq    $32, %rax
> > -     orq %rcx, %rax
> > -     tzcntq  %rax, %rax
> > -     leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> > +     .p2align 4
> > +L(last_vec_x0):
> > +     tzcntl  %eax, %eax
> > +     addq    $-(VEC_SIZE * 4), %rdi
> >  # ifndef USE_AS_STRCHRNUL
> > -     cmp (%rax), %CHAR_REG
> > -     cmovne  %rdx, %rax
> > +     /* Found CHAR or the null byte.  */
> > +     cmp     (%rdi, %rax), %CHAR_REG
> > +     jne     L(zero_end)
> >  # endif
> > +     addq    %rdi, %rax
> >       VZEROUPPER_RETURN
> >
> > +# ifndef USE_AS_STRCHRNUL
> > +L(zero_end):
> > +     xorl    %eax, %eax
> > +     VZEROUPPER_RETURN
> > +# endif
> > +
> > +     .p2align 4
> > +L(last_vec_x1):
> > +     tzcntl  %eax, %eax
> > +     subq    $(VEC_SIZE * 3), %rdi
> > +# ifndef USE_AS_STRCHRNUL
> > +     /* Found CHAR or the null byte.  */
> > +     cmp     (%rdi, %rax), %CHAR_REG
> > +     jne     L(zero_end)
> > +# endif
> > +     addq    %rdi, %rax
> > +     VZEROUPPER_RETURN
> > +
> > +
> >       /* Cold case for crossing page with first load.  */
> >       .p2align 4
> >  L(cross_page_boundary):
> > -     andq    $-VEC_SIZE, %rdi
> > -     andl    $(VEC_SIZE - 1), %ecx
> > -
> > -     vmovdqa (%rdi), %ymm8
> > -     VPCMPEQ %ymm8, %ymm0, %ymm1
> > -     VPCMPEQ %ymm8, %ymm9, %ymm2
> > +     movq    %rdi, %rdx
> > +     /* Align rdi to VEC_SIZE - 1.  */
> > +     orq     $(VEC_SIZE - 1), %rdi
> > +     vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8
> > +     VPCMPEQ %ymm8, %ymm0, %ymm1
> > +     VPCMPEQ %ymm8, %ymm9, %ymm2
> >       vpor    %ymm1, %ymm2, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     /* Remove the leading bits.      */
> > -     sarxl   %ecx, %eax, %eax
> > +     vpmovmskb       %ymm1, %eax
> > +     /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> > +        so no need to manually mod edx.  */
> > +     sarxl   %edx, %eax, %eax
> >       testl   %eax, %eax
> > -     jz      L(aligned_more)
> > +     jz      L(cross_page_continue)
> >       tzcntl  %eax, %eax
> > -     addq    %rcx, %rdi
> > -     addq    %rdi, %rax
> >  # ifndef USE_AS_STRCHRNUL
> > -     cmp (%rax), %CHAR_REG
> > -     cmovne  %rdx, %rax
> > +     xorl    %ecx, %ecx
> > +     /* Found CHAR or the null byte.  */
> > +     cmp     (%rdx, %rax), %CHAR_REG
> > +     leaq    (%rdx, %rax), %rax
> > +     cmovne  %rcx, %rax
> > +# else
> > +     addq    %rdx, %rax
> >  # endif
> > -     VZEROUPPER_RETURN
> > +L(return_vzeroupper):
> > +     ZERO_UPPER_VEC_REGISTERS_RETURN
> >
> >  END (STRCHR)
> >  # endif
> > --
> > 2.29.2
> >
>
> Thanks.
>
> H.J.
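
One more idiom from the hunks above, for readers following the "rcx has
combined result from all 4 VEC" comment: on loop exit, two 32-bit vpmovmskb
results are glued into one 64-bit value so a single tzcnt locates the first
hit across 64 bytes of data. A minimal C model (invented name; __builtin_ctzll
assumes GCC/Clang and at least one set bit, which the loop exit condition
guarantees):

#include <stdint.h>

/* MASK_LO describes bytes [0, 31] from some base, MASK_HI bytes [32, 63].
   Mirrors the salq $32; orq; tzcntq sequence.  */
static inline unsigned int first_hit_in_64_bytes(uint32_t mask_lo,
                                                 uint32_t mask_hi)
{
    uint64_t both = ((uint64_t) mask_hi << 32) | mask_lo;
    return (unsigned int) __builtin_ctzll(both);
}

The subq $(VEC_SIZE * 2), %rdi that follows in the patch then makes that
0..63 index an offset from the start of the third of the four vectors just
examined, with bit positions 32 and up falling into the fourth.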
  
H.J. Lu April 23, 2021, 8:14 p.m. UTC | #3
On Fri, Apr 23, 2021 at 12:56 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Apr 23, 2021 at 12:56 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Thu, Apr 22, 2021 at 02:04:02PM -0400, Noah Goldstein wrote:
> > > No bug. This commit optimizes strchr-avx2.S. The optimizations are all
> > > small things such as save an ALU in the alignment process, saving a
> > > few instructions in the loop return, saving some bytes in the main
> > > loop, and increasing the ILP in the return cases. test-strchr,
> > > test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
> > >
> > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > > ---
> > >  sysdeps/x86_64/multiarch/strchr-avx2.S | 294 +++++++++++++++----------
> > >  1 file changed, 173 insertions(+), 121 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > index 25bec38b5d..220165d2ba 100644
> > > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > @@ -49,132 +49,144 @@
> > >
> > >       .section SECTION(.text),"ax",@progbits
> > >  ENTRY (STRCHR)
> > > -     movl    %edi, %ecx
> > > -# ifndef USE_AS_STRCHRNUL
> > > -     xorl    %edx, %edx
> > > -# endif
> > > -
> > >       /* Broadcast CHAR to YMM0.      */
> > >       vmovd   %esi, %xmm0
> > > +     movl    %edi, %eax
> > > +     andl    $(PAGE_SIZE - 1), %eax
> > > +     VPBROADCAST     %xmm0, %ymm0
> > >       vpxor   %xmm9, %xmm9, %xmm9
> > > -     VPBROADCAST %xmm0, %ymm0
> > >
> > >       /* Check if we cross page boundary with one vector load.  */
> > > -     andl    $(PAGE_SIZE - 1), %ecx
> > > -     cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> > > -     ja  L(cross_page_boundary)
> > > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > > +     ja      L(cross_page_boundary)
> > >
> > >       /* Check the first VEC_SIZE bytes.      Search for both CHAR and the
> > >          null byte.  */
> > >       vmovdqu (%rdi), %ymm8
> > > -     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > -     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > +     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > +     VPCMPEQ %ymm8, %ymm9, %ymm2
> > >       vpor    %ymm1, %ymm2, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > +     vpmovmskb       %ymm1, %eax
> >
> > This change isn't needed.
>
> Fixed. For the future what is the rule for when to have a tab after
> the instructions vs just a space?

I prefer if the length of <INSN> is less than a tab,

<TAB><INSN><TAB><OPERANDS>

Otherwise

<TAB><INSN><SPACE><OPERANDS>

> >
> > >       testl   %eax, %eax
> > > -     jz      L(more_vecs)
> > > +     jz      L(aligned_more)
> > >       tzcntl  %eax, %eax
> > > -     /* Found CHAR or the null byte.  */
> > > -     addq    %rdi, %rax
> > >  # ifndef USE_AS_STRCHRNUL
> > > -     cmp (%rax), %CHAR_REG
> > > -     cmovne  %rdx, %rax
> > > +     /* Found CHAR or the null byte.  */
> > > +     cmp     (%rdi, %rax), %CHAR_REG
> > > +     jne     L(zero)
> > >  # endif
> > > -L(return_vzeroupper):
> > > -     ZERO_UPPER_VEC_REGISTERS_RETURN
> > > -
> > > -     .p2align 4
> > > -L(more_vecs):
> > > -     /* Align data for aligned loads in the loop.  */
> > > -     andq    $-VEC_SIZE, %rdi
> > > -L(aligned_more):
> > > -
> > > -     /* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > > -        since data is only aligned to VEC_SIZE.      */
> > > -     vmovdqa VEC_SIZE(%rdi), %ymm8
> > > -     addq    $VEC_SIZE, %rdi
> > > -     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > -     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > -     vpor    %ymm1, %ymm2, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > -     testl   %eax, %eax
> > > -     jnz     L(first_vec_x0)
> > > -
> > > -     vmovdqa VEC_SIZE(%rdi), %ymm8
> > > -     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > -     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > -     vpor    %ymm1, %ymm2, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > -     testl   %eax, %eax
> > > -     jnz     L(first_vec_x1)
> > > -
> > > -     vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8
> > > -     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > -     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > -     vpor    %ymm1, %ymm2, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > -     testl   %eax, %eax
> > > -     jnz     L(first_vec_x2)
> > > -
> > > -     vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > > -     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > -     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > -     vpor    %ymm1, %ymm2, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > -     testl   %eax, %eax
> > > -     jz      L(prep_loop_4x)
> > > +     addq    %rdi, %rax
> > > +     VZEROUPPER_RETURN
> > >
> > > +     /* .p2align 5 helps keep performance more consistent if ENTRY()
> > > +        alignment % 32 was either 16 or 0. As well this makes the
> > > +        alignment % 32 of the loop_4x_vec fixed which makes tuning it
> > > +        easier.  */
> > > +     .p2align 5
> > > +L(first_vec_x4):
> > >       tzcntl  %eax, %eax
> > > -     leaq    (VEC_SIZE * 3)(%rdi, %rax), %rax
> > > +     addq    $(VEC_SIZE * 3 + 1), %rdi
> > >  # ifndef USE_AS_STRCHRNUL
> > > -     cmp (%rax), %CHAR_REG
> > > -     cmovne  %rdx, %rax
> > > +     /* Found CHAR or the null byte.  */
> > > +     cmp     (%rdi, %rax), %CHAR_REG
> > > +     jne     L(zero)
> > >  # endif
> > > +     addq    %rdi, %rax
> > >       VZEROUPPER_RETURN
> > >
> > > -     .p2align 4
> > > -L(first_vec_x0):
> > > -     tzcntl  %eax, %eax
> > > -     /* Found CHAR or the null byte.  */
> > > -     addq    %rdi, %rax
> > >  # ifndef USE_AS_STRCHRNUL
> > > -     cmp (%rax), %CHAR_REG
> > > -     cmovne  %rdx, %rax
> > > -# endif
> > > +L(zero):
> > > +     xorl    %eax, %eax
> > >       VZEROUPPER_RETURN
> > > +# endif
> > > +
> > >
> > >       .p2align 4
> > >  L(first_vec_x1):
> > >       tzcntl  %eax, %eax
> > > -     leaq    VEC_SIZE(%rdi, %rax), %rax
> > > +     incq    %rdi
> > >  # ifndef USE_AS_STRCHRNUL
> > > -     cmp (%rax), %CHAR_REG
> > > -     cmovne  %rdx, %rax
> > > +     /* Found CHAR or the null byte.  */
> > > +     cmp     (%rdi, %rax), %CHAR_REG
> > > +     jne     L(zero)
> > >  # endif
> > > +     addq    %rdi, %rax
> > >       VZEROUPPER_RETURN
> > >
> > >       .p2align 4
> > >  L(first_vec_x2):
> > >       tzcntl  %eax, %eax
> > > +     addq    $(VEC_SIZE + 1), %rdi
> > > +# ifndef USE_AS_STRCHRNUL
> > >       /* Found CHAR or the null byte.  */
> > > -     leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > +     cmp     (%rdi, %rax), %CHAR_REG
> > > +     jne     L(zero)
> > > +# endif
> > > +     addq    %rdi, %rax
> > > +     VZEROUPPER_RETURN
> > > +
> > > +     .p2align 4
> > > +L(first_vec_x3):
> > > +     tzcntl  %eax, %eax
> > > +     addq    $(VEC_SIZE * 2 + 1), %rdi
> > >  # ifndef USE_AS_STRCHRNUL
> > > -     cmp (%rax), %CHAR_REG
> > > -     cmovne  %rdx, %rax
> > > +     /* Found CHAR or the null byte.  */
> > > +     cmp     (%rdi, %rax), %CHAR_REG
> > > +     jne     L(zero)
> > >  # endif
> > > +     addq    %rdi, %rax
> > >       VZEROUPPER_RETURN
> > >
> > > -L(prep_loop_4x):
> > > -     /* Align data to 4 * VEC_SIZE.  */
> > > -     andq    $-(VEC_SIZE * 4), %rdi
> > > +     .p2align 4
> > > +L(aligned_more):
> > > +     /* Align data to VEC_SIZE - 1. This is the same number of
> > > +        instructions as using andq -VEC_SIZE but saves 4 bytes of code on
> > > +        x4 check.  */
> > > +     orq     $(VEC_SIZE - 1), %rdi
> > > +L(cross_page_continue):
> > > +     /* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time since
> > > +        data is only aligned to VEC_SIZE.  */
> > > +     vmovdqa 1(%rdi), %ymm8
> > > +     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > +     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > +     vpor    %ymm1, %ymm2, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> >
> > Use a space, not tab, after vpmovmskb.
>
> Fixed.
>
> >
> > > +     testl   %eax, %eax
> > > +     jnz     L(first_vec_x1)
> > >
> > > +     vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8
> > > +     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > +     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > +     vpor    %ymm1, %ymm2, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> >
> > Use a space, not tab, after vpmovmskb.
> >
>
> Fixed.
>
> > > +     testl   %eax, %eax
> > > +     jnz     L(first_vec_x2)
> > > +
> > > +     vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8
> > > +     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > +     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > +     vpor    %ymm1, %ymm2, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> >
> > Use a space, not tab, after vpmovmskb.
> >
>
> Fixed.
>
> > > +     testl   %eax, %eax
> > > +     jnz     L(first_vec_x3)
> > > +
> > > +     vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8
> > > +     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > +     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > +     vpor    %ymm1, %ymm2, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> >
> > Use a space, not tab, after vpmovmskb.
> >
>
> Fixed.
>
> > > +     testl   %eax, %eax
> > > +     jnz     L(first_vec_x4)
> > > +     /* Align data to VEC_SIZE * 4 - 1.      */
> > > +     addq    $(VEC_SIZE * 4 + 1), %rdi
> > > +     andq    $-(VEC_SIZE * 4), %rdi
> > >       .p2align 4
> > >  L(loop_4x_vec):
> > >       /* Compare 4 * VEC at a time forward.  */
> > > -     vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> > > -     vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> > > -     vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> > > -     vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> > > +     vmovdqa (%rdi), %ymm5
> > > +     vmovdqa (VEC_SIZE)(%rdi), %ymm6
> > > +     vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> > > +     vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > >
> > >       /* Leaves only CHARS matching esi as 0.  */
> > >       vpxor   %ymm5, %ymm0, %ymm1
> > > @@ -190,62 +202,102 @@ L(loop_4x_vec):
> > >       VPMINU  %ymm1, %ymm2, %ymm5
> > >       VPMINU  %ymm3, %ymm4, %ymm6
> > >
> > > -     VPMINU  %ymm5, %ymm6, %ymm5
> > > +     VPMINU  %ymm5, %ymm6, %ymm6
> > >
> > > -     VPCMPEQ %ymm5, %ymm9, %ymm5
> > > -     vpmovmskb %ymm5, %eax
> > > +     VPCMPEQ %ymm6, %ymm9, %ymm6
> > > +     vpmovmskb       %ymm6, %ecx
> >
> > Use a space, not tab, after vpmovmskb.
> >
>
> Fixed.
>
> > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > +     testl   %ecx, %ecx
> > > +     jz      L(loop_4x_vec)
> > >
> > > -     addq    $(VEC_SIZE * 4), %rdi
> > > -     testl   %eax, %eax
> > > -     jz  L(loop_4x_vec)
> > >
> > > -     VPCMPEQ %ymm1, %ymm9, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > +     VPCMPEQ %ymm1, %ymm9, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> >
> > Use a space, not tab, after vpmovmskb.
> >
>
> Fixed.
>
> > >       testl   %eax, %eax
> > > -     jnz     L(first_vec_x0)
> > > +     jnz     L(last_vec_x0)
> > > +
> > >
> > > -     VPCMPEQ %ymm2, %ymm9, %ymm2
> > > -     vpmovmskb %ymm2, %eax
> > > +     VPCMPEQ %ymm5, %ymm9, %ymm2
> > > +     vpmovmskb       %ymm2, %eax
> >
> > Use a space, not tab, after vpmovmskb.
> >
>
> Fixed.
>
> > >       testl   %eax, %eax
> > > -     jnz     L(first_vec_x1)
> > > +     jnz     L(last_vec_x1)
> > > +
> > > +     VPCMPEQ %ymm3, %ymm9, %ymm3
> > > +     vpmovmskb       %ymm3, %eax
> > > +     /* rcx has combined result from all 4 VEC. It will only be used if
> > > +        the first 3 other VEC all did not contain a match.  */
> > > +     salq    $32, %rcx
> > > +     orq     %rcx, %rax
> > > +     tzcntq  %rax, %rax
> > > +     subq    $(VEC_SIZE * 2), %rdi
> > > +# ifndef USE_AS_STRCHRNUL
> > > +     /* Found CHAR or the null byte.  */
> > > +     cmp     (%rdi, %rax), %CHAR_REG
> > > +     jne     L(zero_end)
> > > +# endif
> > > +     addq    %rdi, %rax
> > > +     VZEROUPPER_RETURN
> > > +
> > >
> > > -     VPCMPEQ %ymm3, %ymm9, %ymm3
> > > -     VPCMPEQ %ymm4, %ymm9, %ymm4
> > > -     vpmovmskb %ymm3, %ecx
> > > -     vpmovmskb %ymm4, %eax
> > > -     salq    $32, %rax
> > > -     orq %rcx, %rax
> > > -     tzcntq  %rax, %rax
> > > -     leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > +     .p2align 4
> > > +L(last_vec_x0):
> > > +     tzcntl  %eax, %eax
> > > +     addq    $-(VEC_SIZE * 4), %rdi
> > >  # ifndef USE_AS_STRCHRNUL
> > > -     cmp (%rax), %CHAR_REG
> > > -     cmovne  %rdx, %rax
> > > +     /* Found CHAR or the null byte.  */
> > > +     cmp     (%rdi, %rax), %CHAR_REG
> > > +     jne     L(zero_end)
> > >  # endif
> > > +     addq    %rdi, %rax
> > >       VZEROUPPER_RETURN
> > >
> > > +# ifndef USE_AS_STRCHRNUL
> > > +L(zero_end):
> > > +     xorl    %eax, %eax
> > > +     VZEROUPPER_RETURN
> > > +# endif
> > > +
> > > +     .p2align 4
> > > +L(last_vec_x1):
> > > +     tzcntl  %eax, %eax
> > > +     subq    $(VEC_SIZE * 3), %rdi
> > > +# ifndef USE_AS_STRCHRNUL
> > > +     /* Found CHAR or the null byte.  */
> > > +     cmp     (%rdi, %rax), %CHAR_REG
> > > +     jne     L(zero_end)
> > > +# endif
> > > +     addq    %rdi, %rax
> > > +     VZEROUPPER_RETURN
> > > +
> > > +
> > >       /* Cold case for crossing page with first load.  */
> > >       .p2align 4
> > >  L(cross_page_boundary):
> > > -     andq    $-VEC_SIZE, %rdi
> > > -     andl    $(VEC_SIZE - 1), %ecx
> > > -
> > > -     vmovdqa (%rdi), %ymm8
> > > -     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > -     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > +     movq    %rdi, %rdx
> > > +     /* Align rdi to VEC_SIZE - 1.  */
> > > +     orq     $(VEC_SIZE - 1), %rdi
> > > +     vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8
> > > +     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > +     VPCMPEQ %ymm8, %ymm9, %ymm2
> > >       vpor    %ymm1, %ymm2, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > -     /* Remove the leading bits.      */
> > > -     sarxl   %ecx, %eax, %eax
> > > +     vpmovmskb       %ymm1, %eax
> > > +     /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> > > +        so no need to manually mod edx.  */
> > > +     sarxl   %edx, %eax, %eax
> > >       testl   %eax, %eax
> > > -     jz      L(aligned_more)
> > > +     jz      L(cross_page_continue)
> > >       tzcntl  %eax, %eax
> > > -     addq    %rcx, %rdi
> > > -     addq    %rdi, %rax
> > >  # ifndef USE_AS_STRCHRNUL
> > > -     cmp (%rax), %CHAR_REG
> > > -     cmovne  %rdx, %rax
> > > +     xorl    %ecx, %ecx
> > > +     /* Found CHAR or the null byte.  */
> > > +     cmp     (%rdx, %rax), %CHAR_REG
> > > +     leaq    (%rdx, %rax), %rax
> > > +     cmovne  %rcx, %rax
> > > +# else
> > > +     addq    %rdx, %rax
> > >  # endif
> > > -     VZEROUPPER_RETURN
> > > +L(return_vzeroupper):
> > > +     ZERO_UPPER_VEC_REGISTERS_RETURN
> > >
> > >  END (STRCHR)
> > >  # endif
> > > --
> > > 2.29.2
> > >
> >
> > Thanks.
> >
> > H.J.
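
On the cross-page path discussed above, the "sarxl only uses bits [5:0] of
COUNT" comment is the key detail: the load is taken from the VEC_SIZE-aligned
address containing the original pointer, and the raw low bits of that pointer
are then used directly as the shift count, because SARX masks the count to
the operand width and no separate andl is needed. A rough C equivalent
(VEC_SIZE assumed to be 32; portable C has to apply the mask explicitly since
over-width shifts are undefined there):

#include <stdint.h>

#define VEC_SIZE 32

/* MASK is the vpmovmskb result for the aligned vector that contains SRC.
   Shifting by SRC's misalignment discards the lanes that precede SRC, so
   bit i of the result refers to byte SRC + i.  */
static inline uint32_t drop_leading_lanes(uint32_t mask, uintptr_t src)
{
    unsigned int misalign = (unsigned int) (src & (VEC_SIZE - 1));
    return mask >> misalign;
}

The assembly uses an arithmetic rather than a logical shift, which is harmless
here: sign bits can only be smeared in when bit 31 of the mask is set, i.e.
when a real match exists, and only the lowest set bit is ever consumed.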
  
Sunil Pandey April 27, 2022, 11:52 p.m. UTC | #4
On Fri, Apr 23, 2021 at 3:11 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Fri, Apr 23, 2021 at 12:56 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Fri, Apr 23, 2021 at 12:56 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Thu, Apr 22, 2021 at 02:04:02PM -0400, Noah Goldstein wrote:
> > > > No bug. This commit optimizes strchr-avx2.S. The optimizations are all
> > > > small things such as save an ALU in the alignment process, saving a
> > > > few instructions in the loop return, saving some bytes in the main
> > > > loop, and increasing the ILP in the return cases. test-strchr,
> > > > test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
> > > >
> > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > > > ---
> > > >  sysdeps/x86_64/multiarch/strchr-avx2.S | 294 +++++++++++++++----------
> > > >  1 file changed, 173 insertions(+), 121 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > index 25bec38b5d..220165d2ba 100644
> > > > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > @@ -49,132 +49,144 @@
> > > >
> > > >       .section SECTION(.text),"ax",@progbits
> > > >  ENTRY (STRCHR)
> > > > -     movl    %edi, %ecx
> > > > -# ifndef USE_AS_STRCHRNUL
> > > > -     xorl    %edx, %edx
> > > > -# endif
> > > > -
> > > >       /* Broadcast CHAR to YMM0.      */
> > > >       vmovd   %esi, %xmm0
> > > > +     movl    %edi, %eax
> > > > +     andl    $(PAGE_SIZE - 1), %eax
> > > > +     VPBROADCAST     %xmm0, %ymm0
> > > >       vpxor   %xmm9, %xmm9, %xmm9
> > > > -     VPBROADCAST %xmm0, %ymm0
> > > >
> > > >       /* Check if we cross page boundary with one vector load.  */
> > > > -     andl    $(PAGE_SIZE - 1), %ecx
> > > > -     cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> > > > -     ja  L(cross_page_boundary)
> > > > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > > > +     ja      L(cross_page_boundary)
> > > >
> > > >       /* Check the first VEC_SIZE bytes.      Search for both CHAR and the
> > > >          null byte.  */
> > > >       vmovdqu (%rdi), %ymm8
> > > > -     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > -     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > +     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > +     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > >       vpor    %ymm1, %ymm2, %ymm1
> > > > -     vpmovmskb %ymm1, %eax
> > > > +     vpmovmskb       %ymm1, %eax
> > >
> > > This change isn't needed.
> >
> > Fixed. For the future what is the rule for when to have a tab after
> > the instructions vs just a space?
>
> I prefer if the length of <INSN> is less than a tab,
>
> <TAB><INSN><TAB><OPERANDS>
>
> Otherwise
>
> <TAB><INSN><SPACE><OPERANDS>
>
> > >
> > > >       testl   %eax, %eax
> > > > -     jz      L(more_vecs)
> > > > +     jz      L(aligned_more)
> > > >       tzcntl  %eax, %eax
> > > > -     /* Found CHAR or the null byte.  */
> > > > -     addq    %rdi, %rax
> > > >  # ifndef USE_AS_STRCHRNUL
> > > > -     cmp (%rax), %CHAR_REG
> > > > -     cmovne  %rdx, %rax
> > > > +     /* Found CHAR or the null byte.  */
> > > > +     cmp     (%rdi, %rax), %CHAR_REG
> > > > +     jne     L(zero)
> > > >  # endif
> > > > -L(return_vzeroupper):
> > > > -     ZERO_UPPER_VEC_REGISTERS_RETURN
> > > > -
> > > > -     .p2align 4
> > > > -L(more_vecs):
> > > > -     /* Align data for aligned loads in the loop.  */
> > > > -     andq    $-VEC_SIZE, %rdi
> > > > -L(aligned_more):
> > > > -
> > > > -     /* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > > > -        since data is only aligned to VEC_SIZE.      */
> > > > -     vmovdqa VEC_SIZE(%rdi), %ymm8
> > > > -     addq    $VEC_SIZE, %rdi
> > > > -     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > -     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > -     vpor    %ymm1, %ymm2, %ymm1
> > > > -     vpmovmskb %ymm1, %eax
> > > > -     testl   %eax, %eax
> > > > -     jnz     L(first_vec_x0)
> > > > -
> > > > -     vmovdqa VEC_SIZE(%rdi), %ymm8
> > > > -     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > -     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > -     vpor    %ymm1, %ymm2, %ymm1
> > > > -     vpmovmskb %ymm1, %eax
> > > > -     testl   %eax, %eax
> > > > -     jnz     L(first_vec_x1)
> > > > -
> > > > -     vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8
> > > > -     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > -     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > -     vpor    %ymm1, %ymm2, %ymm1
> > > > -     vpmovmskb %ymm1, %eax
> > > > -     testl   %eax, %eax
> > > > -     jnz     L(first_vec_x2)
> > > > -
> > > > -     vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > > > -     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > -     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > -     vpor    %ymm1, %ymm2, %ymm1
> > > > -     vpmovmskb %ymm1, %eax
> > > > -     testl   %eax, %eax
> > > > -     jz      L(prep_loop_4x)
> > > > +     addq    %rdi, %rax
> > > > +     VZEROUPPER_RETURN
> > > >
> > > > +     /* .p2align 5 helps keep performance more consistent if ENTRY()
> > > > +        alignment % 32 was either 16 or 0. As well this makes the
> > > > +        alignment % 32 of the loop_4x_vec fixed which makes tuning it
> > > > +        easier.  */
> > > > +     .p2align 5
> > > > +L(first_vec_x4):
> > > >       tzcntl  %eax, %eax
> > > > -     leaq    (VEC_SIZE * 3)(%rdi, %rax), %rax
> > > > +     addq    $(VEC_SIZE * 3 + 1), %rdi
> > > >  # ifndef USE_AS_STRCHRNUL
> > > > -     cmp (%rax), %CHAR_REG
> > > > -     cmovne  %rdx, %rax
> > > > +     /* Found CHAR or the null byte.  */
> > > > +     cmp     (%rdi, %rax), %CHAR_REG
> > > > +     jne     L(zero)
> > > >  # endif
> > > > +     addq    %rdi, %rax
> > > >       VZEROUPPER_RETURN
> > > >
> > > > -     .p2align 4
> > > > -L(first_vec_x0):
> > > > -     tzcntl  %eax, %eax
> > > > -     /* Found CHAR or the null byte.  */
> > > > -     addq    %rdi, %rax
> > > >  # ifndef USE_AS_STRCHRNUL
> > > > -     cmp (%rax), %CHAR_REG
> > > > -     cmovne  %rdx, %rax
> > > > -# endif
> > > > +L(zero):
> > > > +     xorl    %eax, %eax
> > > >       VZEROUPPER_RETURN
> > > > +# endif
> > > > +
> > > >
> > > >       .p2align 4
> > > >  L(first_vec_x1):
> > > >       tzcntl  %eax, %eax
> > > > -     leaq    VEC_SIZE(%rdi, %rax), %rax
> > > > +     incq    %rdi
> > > >  # ifndef USE_AS_STRCHRNUL
> > > > -     cmp (%rax), %CHAR_REG
> > > > -     cmovne  %rdx, %rax
> > > > +     /* Found CHAR or the null byte.  */
> > > > +     cmp     (%rdi, %rax), %CHAR_REG
> > > > +     jne     L(zero)
> > > >  # endif
> > > > +     addq    %rdi, %rax
> > > >       VZEROUPPER_RETURN
> > > >
> > > >       .p2align 4
> > > >  L(first_vec_x2):
> > > >       tzcntl  %eax, %eax
> > > > +     addq    $(VEC_SIZE + 1), %rdi
> > > > +# ifndef USE_AS_STRCHRNUL
> > > >       /* Found CHAR or the null byte.  */
> > > > -     leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > > +     cmp     (%rdi, %rax), %CHAR_REG
> > > > +     jne     L(zero)
> > > > +# endif
> > > > +     addq    %rdi, %rax
> > > > +     VZEROUPPER_RETURN
> > > > +
> > > > +     .p2align 4
> > > > +L(first_vec_x3):
> > > > +     tzcntl  %eax, %eax
> > > > +     addq    $(VEC_SIZE * 2 + 1), %rdi
> > > >  # ifndef USE_AS_STRCHRNUL
> > > > -     cmp (%rax), %CHAR_REG
> > > > -     cmovne  %rdx, %rax
> > > > +     /* Found CHAR or the null byte.  */
> > > > +     cmp     (%rdi, %rax), %CHAR_REG
> > > > +     jne     L(zero)
> > > >  # endif
> > > > +     addq    %rdi, %rax
> > > >       VZEROUPPER_RETURN
> > > >
> > > > -L(prep_loop_4x):
> > > > -     /* Align data to 4 * VEC_SIZE.  */
> > > > -     andq    $-(VEC_SIZE * 4), %rdi
> > > > +     .p2align 4
> > > > +L(aligned_more):
> > > > +     /* Align data to VEC_SIZE - 1. This is the same number of
> > > > +        instructions as using andq -VEC_SIZE but saves 4 bytes of code on
> > > > +        x4 check.  */
> > > > +     orq     $(VEC_SIZE - 1), %rdi
> > > > +L(cross_page_continue):
> > > > +     /* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time since
> > > > +        data is only aligned to VEC_SIZE.  */
> > > > +     vmovdqa 1(%rdi), %ymm8
> > > > +     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > +     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > +     vpor    %ymm1, %ymm2, %ymm1
> > > > +     vpmovmskb       %ymm1, %eax
> > >
> > > Use a space, not tab, after vpmovmskb.
> >
> > Fixed.
> >
> > >
> > > > +     testl   %eax, %eax
> > > > +     jnz     L(first_vec_x1)
> > > >
> > > > +     vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8
> > > > +     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > +     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > +     vpor    %ymm1, %ymm2, %ymm1
> > > > +     vpmovmskb       %ymm1, %eax
> > >
> > > Use a space, not tab, after vpmovmskb.
> > >
> >
> > Fixed.
> >
> > > > +     testl   %eax, %eax
> > > > +     jnz     L(first_vec_x2)
> > > > +
> > > > +     vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8
> > > > +     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > +     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > +     vpor    %ymm1, %ymm2, %ymm1
> > > > +     vpmovmskb       %ymm1, %eax
> > >
> > > Use a space, not tab, after vpmovmskb.
> > >
> >
> > Fixed.
> >
> > > > +     testl   %eax, %eax
> > > > +     jnz     L(first_vec_x3)
> > > > +
> > > > +     vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8
> > > > +     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > +     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > +     vpor    %ymm1, %ymm2, %ymm1
> > > > +     vpmovmskb       %ymm1, %eax
> > >
> > > Use a space, not tab, after vpmovmskb.
> > >
> >
> > Fixed.
> >
> > > > +     testl   %eax, %eax
> > > > +     jnz     L(first_vec_x4)
> > > > +     /* Align data to VEC_SIZE * 4 - 1.      */
> > > > +     addq    $(VEC_SIZE * 4 + 1), %rdi
> > > > +     andq    $-(VEC_SIZE * 4), %rdi
> > > >       .p2align 4
> > > >  L(loop_4x_vec):
> > > >       /* Compare 4 * VEC at a time forward.  */
> > > > -     vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> > > > -     vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> > > > -     vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> > > > -     vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> > > > +     vmovdqa (%rdi), %ymm5
> > > > +     vmovdqa (VEC_SIZE)(%rdi), %ymm6
> > > > +     vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> > > > +     vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > > >
> > > >       /* Leaves only CHARS matching esi as 0.  */
> > > >       vpxor   %ymm5, %ymm0, %ymm1
> > > > @@ -190,62 +202,102 @@ L(loop_4x_vec):
> > > >       VPMINU  %ymm1, %ymm2, %ymm5
> > > >       VPMINU  %ymm3, %ymm4, %ymm6
> > > >
> > > > -     VPMINU  %ymm5, %ymm6, %ymm5
> > > > +     VPMINU  %ymm5, %ymm6, %ymm6
> > > >
> > > > -     VPCMPEQ %ymm5, %ymm9, %ymm5
> > > > -     vpmovmskb %ymm5, %eax
> > > > +     VPCMPEQ %ymm6, %ymm9, %ymm6
> > > > +     vpmovmskb       %ymm6, %ecx
> > >
> > > Use a space, not tab, after vpmovmskb.
> > >
> >
> > Fixed.
> >
> > > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > > +     testl   %ecx, %ecx
> > > > +     jz      L(loop_4x_vec)
> > > >
> > > > -     addq    $(VEC_SIZE * 4), %rdi
> > > > -     testl   %eax, %eax
> > > > -     jz  L(loop_4x_vec)
> > > >
> > > > -     VPCMPEQ %ymm1, %ymm9, %ymm1
> > > > -     vpmovmskb %ymm1, %eax
> > > > +     VPCMPEQ %ymm1, %ymm9, %ymm1
> > > > +     vpmovmskb       %ymm1, %eax
> > >
> > > Use a space, not tab, after vpmovmskb.
> > >
> >
> > Fixed.
> >
> > > >       testl   %eax, %eax
> > > > -     jnz     L(first_vec_x0)
> > > > +     jnz     L(last_vec_x0)
> > > > +
> > > >
> > > > -     VPCMPEQ %ymm2, %ymm9, %ymm2
> > > > -     vpmovmskb %ymm2, %eax
> > > > +     VPCMPEQ %ymm5, %ymm9, %ymm2
> > > > +     vpmovmskb       %ymm2, %eax
> > >
> > > Use a space, not tab, after vpmovmskb.
> > >
> >
> > Fixed.
> >
> > > >       testl   %eax, %eax
> > > > -     jnz     L(first_vec_x1)
> > > > +     jnz     L(last_vec_x1)
> > > > +
> > > > +     VPCMPEQ %ymm3, %ymm9, %ymm3
> > > > +     vpmovmskb       %ymm3, %eax
> > > > +     /* rcx has combined result from all 4 VEC. It will only be used if
> > > > +        the first 3 other VEC all did not contain a match.  */
> > > > +     salq    $32, %rcx
> > > > +     orq     %rcx, %rax
> > > > +     tzcntq  %rax, %rax
> > > > +     subq    $(VEC_SIZE * 2), %rdi
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > +     /* Found CHAR or the null byte.  */
> > > > +     cmp     (%rdi, %rax), %CHAR_REG
> > > > +     jne     L(zero_end)
> > > > +# endif
> > > > +     addq    %rdi, %rax
> > > > +     VZEROUPPER_RETURN
> > > > +
> > > >
> > > > -     VPCMPEQ %ymm3, %ymm9, %ymm3
> > > > -     VPCMPEQ %ymm4, %ymm9, %ymm4
> > > > -     vpmovmskb %ymm3, %ecx
> > > > -     vpmovmskb %ymm4, %eax
> > > > -     salq    $32, %rax
> > > > -     orq %rcx, %rax
> > > > -     tzcntq  %rax, %rax
> > > > -     leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > > +     .p2align 4
> > > > +L(last_vec_x0):
> > > > +     tzcntl  %eax, %eax
> > > > +     addq    $-(VEC_SIZE * 4), %rdi
> > > >  # ifndef USE_AS_STRCHRNUL
> > > > -     cmp (%rax), %CHAR_REG
> > > > -     cmovne  %rdx, %rax
> > > > +     /* Found CHAR or the null byte.  */
> > > > +     cmp     (%rdi, %rax), %CHAR_REG
> > > > +     jne     L(zero_end)
> > > >  # endif
> > > > +     addq    %rdi, %rax
> > > >       VZEROUPPER_RETURN
> > > >
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > +L(zero_end):
> > > > +     xorl    %eax, %eax
> > > > +     VZEROUPPER_RETURN
> > > > +# endif
> > > > +
> > > > +     .p2align 4
> > > > +L(last_vec_x1):
> > > > +     tzcntl  %eax, %eax
> > > > +     subq    $(VEC_SIZE * 3), %rdi
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > +     /* Found CHAR or the null byte.  */
> > > > +     cmp     (%rdi, %rax), %CHAR_REG
> > > > +     jne     L(zero_end)
> > > > +# endif
> > > > +     addq    %rdi, %rax
> > > > +     VZEROUPPER_RETURN
> > > > +
> > > > +
> > > >       /* Cold case for crossing page with first load.  */
> > > >       .p2align 4
> > > >  L(cross_page_boundary):
> > > > -     andq    $-VEC_SIZE, %rdi
> > > > -     andl    $(VEC_SIZE - 1), %ecx
> > > > -
> > > > -     vmovdqa (%rdi), %ymm8
> > > > -     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > -     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > +     movq    %rdi, %rdx
> > > > +     /* Align rdi to VEC_SIZE - 1.  */
> > > > +     orq     $(VEC_SIZE - 1), %rdi
> > > > +     vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8
> > > > +     VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > +     VPCMPEQ %ymm8, %ymm9, %ymm2
> > > >       vpor    %ymm1, %ymm2, %ymm1
> > > > -     vpmovmskb %ymm1, %eax
> > > > -     /* Remove the leading bits.      */
> > > > -     sarxl   %ecx, %eax, %eax
> > > > +     vpmovmskb       %ymm1, %eax
> > > > +     /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> > > > +        so no need to manually mod edx.  */
> > > > +     sarxl   %edx, %eax, %eax
> > > >       testl   %eax, %eax
> > > > -     jz      L(aligned_more)
> > > > +     jz      L(cross_page_continue)
> > > >       tzcntl  %eax, %eax
> > > > -     addq    %rcx, %rdi
> > > > -     addq    %rdi, %rax
> > > >  # ifndef USE_AS_STRCHRNUL
> > > > -     cmp (%rax), %CHAR_REG
> > > > -     cmovne  %rdx, %rax
> > > > +     xorl    %ecx, %ecx
> > > > +     /* Found CHAR or the null byte.  */
> > > > +     cmp     (%rdx, %rax), %CHAR_REG
> > > > +     leaq    (%rdx, %rax), %rax
> > > > +     cmovne  %rcx, %rax
> > > > +# else
> > > > +     addq    %rdx, %rax
> > > >  # endif
> > > > -     VZEROUPPER_RETURN
> > > > +L(return_vzeroupper):
> > > > +     ZERO_UPPER_VEC_REGISTERS_RETURN
> > > >
> > > >  END (STRCHR)
> > > >  # endif
> > > > --
> > > > 2.29.2
> > > >
> > >
> > > Thanks.
> > >
> > > H.J.
>
>
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
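
For anyone reviewing the backport, the part of the patch that tends to raise
questions is the "Align data to VEC_SIZE - 1" comment: instead of rounding the
pointer down with andq $-VEC_SIZE, it is rounded up to the last byte of its
vector with orq $(VEC_SIZE - 1), and the following loads use displacements of
1, VEC_SIZE + 1, and so on. Both forms address the same aligned vectors; the
OR form keeps the largest displacement at VEC_SIZE * 3 + 1 = 97, which still
fits in a signed 8-bit displacement, whereas the rounded-down form would need
VEC_SIZE * 4 = 128 on the fourth check, and that is presumably where the
"saves 4 bytes of code on x4 check" comes from. A small C illustration (names
invented, VEC_SIZE assumed to be 32):

#include <stdint.h>

#define VEC_SIZE 32

/* andq $-VEC_SIZE form: first byte of the vector containing S.  */
static inline uintptr_t vec_first_byte(uintptr_t s)
{
    return s & -(uintptr_t) VEC_SIZE;
}

/* orq $(VEC_SIZE - 1) form: last byte of the vector containing S.
   vec_last_byte(s) + 1 == vec_first_byte(s) + VEC_SIZE, so a load at
   displacement 1 from it is the next aligned vector.  */
static inline uintptr_t vec_last_byte(uintptr_t s)
{
    return s | (VEC_SIZE - 1);
}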
  

Patch

diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index 25bec38b5d..220165d2ba 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -49,132 +49,144 @@ 
 
 	.section SECTION(.text),"ax",@progbits
 ENTRY (STRCHR)
-	movl	%edi, %ecx
-# ifndef USE_AS_STRCHRNUL
-	xorl	%edx, %edx
-# endif
-
 	/* Broadcast CHAR to YMM0.	*/
 	vmovd	%esi, %xmm0
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	VPBROADCAST	%xmm0, %ymm0
 	vpxor	%xmm9, %xmm9, %xmm9
-	VPBROADCAST %xmm0, %ymm0
 
 	/* Check if we cross page boundary with one vector load.  */
-	andl	$(PAGE_SIZE - 1), %ecx
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
-	ja  L(cross_page_boundary)
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
 	   null byte.  */
 	vmovdqu	(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
 	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
-	jz	L(more_vecs)
+	jz	L(aligned_more)
 	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
-	addq	%rdi, %rax
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
 # endif
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
-
-	.p2align 4
-L(more_vecs):
-	/* Align data for aligned loads in the loop.  */
-	andq	$-VEC_SIZE, %rdi
-L(aligned_more):
-
-	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.	*/
-	vmovdqa	VEC_SIZE(%rdi), %ymm8
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	vmovdqa	VEC_SIZE(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jz	L(prep_loop_4x)
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
+	/* .p2align 5 helps keep performance more consistent if ENTRY()
+	   alignment % 32 was either 16 or 0. As well this makes the
+	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
+	   easier.  */
+	.p2align 5
+L(first_vec_x4):
 	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
 # endif
+	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(first_vec_x0):
-	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
-	addq	%rdi, %rax
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
-# endif
+L(zero):
+	xorl	%eax, %eax
 	VZEROUPPER_RETURN
+# endif
+
 
 	.p2align 4
 L(first_vec_x1):
 	tzcntl	%eax, %eax
-	leaq	VEC_SIZE(%rdi, %rax), %rax
+	incq	%rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
 # endif
+	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
 	.p2align 4
 L(first_vec_x2):
 	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE + 1), %rdi
+# ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 2 + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
 # endif
+	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
-L(prep_loop_4x):
-	/* Align data to 4 * VEC_SIZE.	*/
-	andq	$-(VEC_SIZE * 4), %rdi
+	.p2align 4
+L(aligned_more):
+	/* Align data to VEC_SIZE - 1. This is the same number of
+	   instructions as using andq -VEC_SIZE but saves 4 bytes of code on
+	   x4 check.  */
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time since
+	   data is only aligned to VEC_SIZE.  */
+	vmovdqa	1(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
 
+	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
+
+	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+	/* Align data to VEC_SIZE * 4 - 1.	*/
+	addq	$(VEC_SIZE * 4 + 1), %rdi
+	andq	$-(VEC_SIZE * 4), %rdi
 	.p2align 4
 L(loop_4x_vec):
 	/* Compare 4 * VEC at a time forward.  */
-	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
-	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
-	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
-	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
+	vmovdqa	(%rdi), %ymm5
+	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
+	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
 
 	/* Leaves only CHARS matching esi as 0.	 */
 	vpxor	%ymm5, %ymm0, %ymm1
@@ -190,62 +202,102 @@  L(loop_4x_vec):
 	VPMINU	%ymm1, %ymm2, %ymm5
 	VPMINU	%ymm3, %ymm4, %ymm6
 
-	VPMINU	%ymm5, %ymm6, %ymm5
+	VPMINU	%ymm5, %ymm6, %ymm6
 
-	VPCMPEQ %ymm5, %ymm9, %ymm5
-	vpmovmskb %ymm5, %eax
+	VPCMPEQ	%ymm6, %ymm9, %ymm6
+	vpmovmskb	%ymm6, %ecx
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
 
-	addq	$(VEC_SIZE * 4), %rdi
-	testl	%eax, %eax
-	jz  L(loop_4x_vec)
 
-	VPCMPEQ %ymm1, %ymm9, %ymm1
-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	%ymm1, %ymm9, %ymm1
+	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	jnz	L(last_vec_x0)
+
 
-	VPCMPEQ %ymm2, %ymm9, %ymm2
-	vpmovmskb %ymm2, %eax
+	VPCMPEQ	%ymm5, %ymm9, %ymm2
+	vpmovmskb	%ymm2, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	jnz	L(last_vec_x1)
+
+	VPCMPEQ	%ymm3, %ymm9, %ymm3
+	vpmovmskb	%ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used if
+	   the first 3 other VEC all did not contain a match.  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	subq	$(VEC_SIZE * 2), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
 
-	VPCMPEQ %ymm3, %ymm9, %ymm3
-	VPCMPEQ %ymm4, %ymm9, %ymm4
-	vpmovmskb %ymm3, %ecx
-	vpmovmskb %ymm4, %eax
-	salq	$32, %rax
-	orq %rcx, %rax
-	tzcntq  %rax, %rax
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+	.p2align 4
+L(last_vec_x0):
+	tzcntl	%eax, %eax
+	addq	$-(VEC_SIZE * 4), %rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
 # endif
+	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+# endif
+
+	.p2align 4
+L(last_vec_x1):
+	tzcntl	%eax, %eax
+	subq	$(VEC_SIZE * 3), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+
 	/* Cold case for crossing page with first load.	 */
 	.p2align 4
 L(cross_page_boundary):
-	andq	$-VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-
-	vmovdqa	(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
+	movq	%rdi, %rdx
+	/* Align rdi to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
 	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	/* Remove the leading bits.	 */
-	sarxl	%ecx, %eax, %eax
+	vpmovmskb	%ymm1, %eax
+	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+	   so no need to manually mod edx.  */
+	sarxl	%edx, %eax, %eax
 	testl	%eax, %eax
-	jz	L(aligned_more)
+	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
-	addq	%rcx, %rdi
-	addq	%rdi, %rax
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	xorl	%ecx, %ecx
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdx, %rax), %CHAR_REG
+	leaq	(%rdx, %rax), %rax
+	cmovne	%rcx, %rax
+# else
+	addq	%rdx, %rax
 # endif
-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 END (STRCHR)
 # endif
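
For reference, the page-cross entry above can be read as the following
minimal C sketch (this is not the glibc implementation; the function
name and the intrinsic-based formulation are illustrative only, and it
assumes AVX2 with a 32-byte vector size):

#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>

#define VEC_SIZE 32

/* Do one aligned 32-byte load that cannot cross the page boundary,
   build a bitmask of bytes equal to CHAR or '\0', then shift off the
   bits that belong to bytes before the real start of the string.
   This mirrors the sarxl in L(cross_page_boundary): the shift count
   is taken mod the operand width, so the low bits of the original
   pointer can be used directly as the count.  */
static const char *
first_char_or_null_cross_page (const char *s, int c)
{
  unsigned int misalign = (unsigned int) ((uintptr_t) s & (VEC_SIZE - 1));
  const __m256i *aligned =
    (const __m256i *) ((uintptr_t) s & ~(uintptr_t) (VEC_SIZE - 1));

  __m256i vec   = _mm256_load_si256 (aligned);
  __m256i vchar = _mm256_set1_epi8 ((char) c);
  __m256i vzero = _mm256_setzero_si256 ();

  /* One mask bit per byte, set where the byte is CHAR or the null
     terminator (the vpor of the two VPCMPEQ results).  */
  unsigned int mask = (unsigned int)
    _mm256_movemask_epi8 (_mm256_or_si256 (_mm256_cmpeq_epi8 (vec, vchar),
                                           _mm256_cmpeq_epi8 (vec, vzero)));

  /* Remove the leading bytes that precede s.  */
  mask >>= misalign;
  if (mask == 0)
    return NULL;   /* The real code falls through to the aligned loop.  */

  const char *match = s + __builtin_ctz (mask);
  /* strchr still has to distinguish "found CHAR" from "hit the null
     byte first"; this is the cmp/cmovne (or jne L(zero)) in the patch.
     strchrnul would simply return match here.  */
  return *match == (char) c ? match : NULL;
}

The aligned_more and loop paths compute the same CHAR/null mask, just
without the shift, since their loads start at the bytes being tested.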