[v4,1/2] x86: Refactor and improve performance of strchr-avx2.S

Message ID 20210203053900.4125403-1-goldstein.w.n@gmail.com
State Committed
Headers
Series [v4,1/2] x86: Refactor and improve performance of strchr-avx2.S

Commit Message

Noah Goldstein via Libc-alpha — Feb. 3, 2021, 5:38 a.m. UTC
  From: noah <goldstein.w.n@gmail.com>

No bug. Just seemed the performance could be improved a bit. Observed
and expected behavior are unchanged. Optimized body of main
loop. Updated page cross logic and optimized accordingly. Made a few
minor instruction selection modifications. No regressions in test
suite. Both test-strchrnul and test-strchr passed.

Signed-off-by: noah <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
 sysdeps/x86_64/multiarch/strchr.c      |   1 +
 2 files changed, 118 insertions(+), 118 deletions(-)
  

Comments

H.J. Lu Feb. 8, 2021, 2:08 p.m. UTC | #1
On Tue, Feb 2, 2021 at 9:39 PM <goldstein.w.n@gmail.com> wrote:
>
> From: noah <goldstein.w.n@gmail.com>
>
> No bug. Just seemed the performance could be improved a bit. Observed
> and expected behavior are unchanged. Optimized body of main
> loop. Updated page cross logic and optimized accordingly. Made a few
> minor instruction selection modifications. No regressions in test
> suite. Both test-strchrnul and test-strchr passed.
>
> Signed-off-by: noah <goldstein.w.n@gmail.com>
> ---
>  sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
>  sysdeps/x86_64/multiarch/strchr.c      |   1 +
>  2 files changed, 118 insertions(+), 118 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> index d416558d04..8b9d78b55a 100644
> --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> @@ -27,10 +27,12 @@
>  # ifdef USE_AS_WCSCHR
>  #  define VPBROADCAST  vpbroadcastd
>  #  define VPCMPEQ      vpcmpeqd
> +#  define VPMINU       vpminud
>  #  define CHAR_REG     esi
>  # else
>  #  define VPBROADCAST  vpbroadcastb
>  #  define VPCMPEQ      vpcmpeqb
> +#  define VPMINU       vpminub
>  #  define CHAR_REG     sil
>  # endif
>
> @@ -39,20 +41,26 @@
>  # endif
>
>  # define VEC_SIZE 32
> +# define PAGE_SIZE 4096
>
>         .section .text.avx,"ax",@progbits
>  ENTRY (STRCHR)
>         movl    %edi, %ecx
> -       /* Broadcast CHAR to YMM0.  */
> +# ifndef USE_AS_STRCHRNUL
> +       xorl    %edx, %edx
> +# endif
> +
> +       /* Broadcast CHAR to YMM0.      */
>         vmovd   %esi, %xmm0
>         vpxor   %xmm9, %xmm9, %xmm9
>         VPBROADCAST %xmm0, %ymm0
> -       /* Check if we may cross page boundary with one vector load.  */
> -       andl    $(2 * VEC_SIZE - 1), %ecx
> -       cmpl    $VEC_SIZE, %ecx
> -       ja      L(cros_page_boundary)
> -
> -       /* Check the first VEC_SIZE bytes.  Search for both CHAR and the
> +
> +       /* Check if we cross page boundary with one vector load.  */
> +       andl    $(PAGE_SIZE - 1), %ecx
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> +       ja  L(cross_page_boundary)
> +
> +       /* Check the first VEC_SIZE bytes.      Search for both CHAR and the
>            null byte.  */
>         vmovdqu (%rdi), %ymm8
>         VPCMPEQ %ymm8, %ymm0, %ymm1
> @@ -60,50 +68,27 @@ ENTRY (STRCHR)
>         vpor    %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
> -       jnz     L(first_vec_x0)
> -
> -       /* Align data for aligned loads in the loop.  */
> -       addq    $VEC_SIZE, %rdi
> -       andl    $(VEC_SIZE - 1), %ecx
> -       andq    $-VEC_SIZE, %rdi
> -
> -       jmp     L(more_4x_vec)
> -
> -       .p2align 4
> -L(cros_page_boundary):
> -       andl    $(VEC_SIZE - 1), %ecx
> -       andq    $-VEC_SIZE, %rdi
> -       vmovdqu (%rdi), %ymm8
> -       VPCMPEQ %ymm8, %ymm0, %ymm1
> -       VPCMPEQ %ymm8, %ymm9, %ymm2
> -       vpor    %ymm1, %ymm2, %ymm1
> -       vpmovmskb %ymm1, %eax
> -       /* Remove the leading bytes.  */
> -       sarl    %cl, %eax
> -       testl   %eax, %eax
> -       jz      L(aligned_more)
> -       /* Found CHAR or the null byte.  */
> +       jz      L(more_vecs)
>         tzcntl  %eax, %eax
> -       addq    %rcx, %rax
> -# ifdef USE_AS_STRCHRNUL
> +       /* Found CHAR or the null byte.  */
>         addq    %rdi, %rax
> -# else
> -       xorl    %edx, %edx
> -       leaq    (%rdi, %rax), %rax
> -       cmp     (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> +       cmp (%rax), %CHAR_REG
>         cmovne  %rdx, %rax
>  # endif
>         VZEROUPPER
>         ret
>
>         .p2align 4
> +L(more_vecs):
> +       /* Align data for aligned loads in the loop.  */
> +       andq    $-VEC_SIZE, %rdi
>  L(aligned_more):
> -       addq    $VEC_SIZE, %rdi
>
> -L(more_4x_vec):
> -       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> -          since data is only aligned to VEC_SIZE.  */
> -       vmovdqa (%rdi), %ymm8
> +       /* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> +          since data is only aligned to VEC_SIZE.      */
> +       vmovdqa VEC_SIZE(%rdi), %ymm8
> +       addq    $VEC_SIZE, %rdi
>         VPCMPEQ %ymm8, %ymm0, %ymm1
>         VPCMPEQ %ymm8, %ymm9, %ymm2
>         vpor    %ymm1, %ymm2, %ymm1
> @@ -125,7 +110,7 @@ L(more_4x_vec):
>         vpor    %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
> -       jnz     L(first_vec_x2)
> +       jnz     L(first_vec_x2)
>
>         vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
>         VPCMPEQ %ymm8, %ymm0, %ymm1
> @@ -133,122 +118,136 @@ L(more_4x_vec):
>         vpor    %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
> -       jnz     L(first_vec_x3)
> -
> -       addq    $(VEC_SIZE * 4), %rdi
> -
> -       /* Align data to 4 * VEC_SIZE.  */
> -       movq    %rdi, %rcx
> -       andl    $(4 * VEC_SIZE - 1), %ecx
> -       andq    $-(4 * VEC_SIZE), %rdi
> -
> -       .p2align 4
> -L(loop_4x_vec):
> -       /* Compare 4 * VEC at a time forward.  */
> -       vmovdqa (%rdi), %ymm5
> -       vmovdqa VEC_SIZE(%rdi), %ymm6
> -       vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> -       vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> -
> -       VPCMPEQ %ymm5, %ymm0, %ymm1
> -       VPCMPEQ %ymm6, %ymm0, %ymm2
> -       VPCMPEQ %ymm7, %ymm0, %ymm3
> -       VPCMPEQ %ymm8, %ymm0, %ymm4
> -
> -       VPCMPEQ %ymm5, %ymm9, %ymm5
> -       VPCMPEQ %ymm6, %ymm9, %ymm6
> -       VPCMPEQ %ymm7, %ymm9, %ymm7
> -       VPCMPEQ %ymm8, %ymm9, %ymm8
> -
> -       vpor    %ymm1, %ymm5, %ymm1
> -       vpor    %ymm2, %ymm6, %ymm2
> -       vpor    %ymm3, %ymm7, %ymm3
> -       vpor    %ymm4, %ymm8, %ymm4
> -
> -       vpor    %ymm1, %ymm2, %ymm5
> -       vpor    %ymm3, %ymm4, %ymm6
> -
> -       vpor    %ymm5, %ymm6, %ymm5
> -
> -       vpmovmskb %ymm5, %eax
> -       testl   %eax, %eax
> -       jnz     L(4x_vec_end)
> -
> -       addq    $(VEC_SIZE * 4), %rdi
> +       jz      L(prep_loop_4x)
>
> -       jmp     L(loop_4x_vec)
> +       tzcntl  %eax, %eax
> +       leaq    (VEC_SIZE * 3)(%rdi, %rax), %rax
> +# ifndef USE_AS_STRCHRNUL
> +       cmp (%rax), %CHAR_REG
> +       cmovne  %rdx, %rax
> +# endif
> +       VZEROUPPER
> +       ret
>
>         .p2align 4
>  L(first_vec_x0):
> -       /* Found CHAR or the null byte.  */
>         tzcntl  %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> +       /* Found CHAR or the null byte.  */
>         addq    %rdi, %rax
> -# else
> -       xorl    %edx, %edx
> -       leaq    (%rdi, %rax), %rax
> -       cmp     (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> +       cmp (%rax), %CHAR_REG
>         cmovne  %rdx, %rax
>  # endif
>         VZEROUPPER
>         ret
> -
> +
>         .p2align 4
>  L(first_vec_x1):
>         tzcntl  %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> -       addq    $VEC_SIZE, %rax
> -       addq    %rdi, %rax
> -# else
> -       xorl    %edx, %edx
>         leaq    VEC_SIZE(%rdi, %rax), %rax
> -       cmp     (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> +       cmp (%rax), %CHAR_REG
>         cmovne  %rdx, %rax
>  # endif
>         VZEROUPPER
> -       ret
> -
> +       ret
> +
>         .p2align 4
>  L(first_vec_x2):
>         tzcntl  %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> -       addq    $(VEC_SIZE * 2), %rax
> -       addq    %rdi, %rax
> -# else
> -       xorl    %edx, %edx
> +       /* Found CHAR or the null byte.  */
>         leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> -       cmp     (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> +       cmp (%rax), %CHAR_REG
>         cmovne  %rdx, %rax
>  # endif
>         VZEROUPPER
>         ret
> +
> +L(prep_loop_4x):
> +       /* Align data to 4 * VEC_SIZE.  */
> +       andq    $-(VEC_SIZE * 4), %rdi
>
>         .p2align 4
> -L(4x_vec_end):
> +L(loop_4x_vec):
> +       /* Compare 4 * VEC at a time forward.  */
> +       vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> +       vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> +       vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> +       vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> +
> +       /* Leaves only CHARS matching esi as 0.  */
> +       vpxor   %ymm5, %ymm0, %ymm1
> +       vpxor   %ymm6, %ymm0, %ymm2
> +       vpxor   %ymm7, %ymm0, %ymm3
> +       vpxor   %ymm8, %ymm0, %ymm4
> +
> +       VPMINU  %ymm1, %ymm5, %ymm1
> +       VPMINU  %ymm2, %ymm6, %ymm2
> +       VPMINU  %ymm3, %ymm7, %ymm3
> +       VPMINU  %ymm4, %ymm8, %ymm4
> +
> +       VPMINU  %ymm1, %ymm2, %ymm5
> +       VPMINU  %ymm3, %ymm4, %ymm6
> +
> +       VPMINU  %ymm5, %ymm6, %ymm5
> +
> +       VPCMPEQ %ymm5, %ymm9, %ymm5
> +       vpmovmskb %ymm5, %eax
> +
> +       addq    $(VEC_SIZE * 4), %rdi
> +       testl   %eax, %eax
> +       jz  L(loop_4x_vec)
> +
> +       VPCMPEQ %ymm1, %ymm9, %ymm1
>         vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x0)
> +
> +       VPCMPEQ %ymm2, %ymm9, %ymm2
>         vpmovmskb %ymm2, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x1)
> -       vpmovmskb %ymm3, %eax
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x2)
> +
> +       VPCMPEQ %ymm3, %ymm9, %ymm3
> +       VPCMPEQ %ymm4, %ymm9, %ymm4
> +       vpmovmskb %ymm3, %ecx
>         vpmovmskb %ymm4, %eax
> +       salq    $32, %rax
> +       orq %rcx, %rax
> +       tzcntq  %rax, %rax
> +       leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> +# ifndef USE_AS_STRCHRNUL
> +       cmp (%rax), %CHAR_REG
> +       cmovne  %rdx, %rax
> +# endif
> +       VZEROUPPER
> +       ret
> +
> +       /* Cold case for crossing page with first load.  */
> +       .p2align 4
> +L(cross_page_boundary):
> +       andq    $-VEC_SIZE, %rdi
> +       andl    $(VEC_SIZE - 1), %ecx
> +
> +       vmovdqa (%rdi), %ymm8
> +       VPCMPEQ %ymm8, %ymm0, %ymm1
> +       VPCMPEQ %ymm8, %ymm9, %ymm2
> +       vpor    %ymm1, %ymm2, %ymm1
> +       vpmovmskb %ymm1, %eax
> +       /* Remove the leading bits.      */
> +       sarxl   %ecx, %eax, %eax
>         testl   %eax, %eax
> -L(first_vec_x3):
> +       jz      L(aligned_more)
>         tzcntl  %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> -       addq    $(VEC_SIZE * 3), %rax
> +       addq    %rcx, %rdi
>         addq    %rdi, %rax
> -# else
> -       xorl    %edx, %edx
> -       leaq    (VEC_SIZE * 3)(%rdi, %rax), %rax
> -       cmp     (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> +       cmp (%rax), %CHAR_REG
>         cmovne  %rdx, %rax
>  # endif
>         VZEROUPPER
>         ret
>
>  END (STRCHR)
> -#endif
> +# endif
> diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
> index 583a152794..4dfbe3b58b 100644
> --- a/sysdeps/x86_64/multiarch/strchr.c
> +++ b/sysdeps/x86_64/multiarch/strchr.c
> @@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
>
>    if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
>        && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
>        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
>      return OPTIMIZE (avx2);
>
> --
> 2.29.2
>

LGTM.

Thanks.
  
H.J. Lu Feb. 8, 2021, 7:33 p.m. UTC | #2
On Mon, Feb 8, 2021 at 6:08 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Feb 2, 2021 at 9:39 PM <goldstein.w.n@gmail.com> wrote:
> >
> > From: noah <goldstein.w.n@gmail.com>
> >
> > No bug. Just seemed the performance could be improved a bit. Observed
> > and expected behavior are unchanged. Optimized body of main
> > loop. Updated page cross logic and optimized accordingly. Made a few
> > minor instruction selection modifications. No regressions in test
> > suite. Both test-strchrnul and test-strchr passed.
> >
> > Signed-off-by: noah <goldstein.w.n@gmail.com>
> > ---
> >  sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
> >  sysdeps/x86_64/multiarch/strchr.c      |   1 +
> >  2 files changed, 118 insertions(+), 118 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > index d416558d04..8b9d78b55a 100644
> > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > @@ -27,10 +27,12 @@
> >  # ifdef USE_AS_WCSCHR
> >  #  define VPBROADCAST  vpbroadcastd
> >  #  define VPCMPEQ      vpcmpeqd
> > +#  define VPMINU       vpminud
> >  #  define CHAR_REG     esi
> >  # else
> >  #  define VPBROADCAST  vpbroadcastb
> >  #  define VPCMPEQ      vpcmpeqb
> > +#  define VPMINU       vpminub
> >  #  define CHAR_REG     sil
> >  # endif
> >
> > @@ -39,20 +41,26 @@
> >  # endif
> >
> >  # define VEC_SIZE 32
> > +# define PAGE_SIZE 4096
> >
> >         .section .text.avx,"ax",@progbits
> >  ENTRY (STRCHR)
> >         movl    %edi, %ecx
> > -       /* Broadcast CHAR to YMM0.  */
> > +# ifndef USE_AS_STRCHRNUL
> > +       xorl    %edx, %edx
> > +# endif
> > +
> > +       /* Broadcast CHAR to YMM0.      */
> >         vmovd   %esi, %xmm0
> >         vpxor   %xmm9, %xmm9, %xmm9
> >         VPBROADCAST %xmm0, %ymm0
> > -       /* Check if we may cross page boundary with one vector load.  */
> > -       andl    $(2 * VEC_SIZE - 1), %ecx
> > -       cmpl    $VEC_SIZE, %ecx
> > -       ja      L(cros_page_boundary)
> > -
> > -       /* Check the first VEC_SIZE bytes.  Search for both CHAR and the
> > +
> > +       /* Check if we cross page boundary with one vector load.  */
> > +       andl    $(PAGE_SIZE - 1), %ecx
> > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> > +       ja  L(cross_page_boundary)
> > +
> > +       /* Check the first VEC_SIZE bytes.      Search for both CHAR and the
> >            null byte.  */
> >         vmovdqu (%rdi), %ymm8
> >         VPCMPEQ %ymm8, %ymm0, %ymm1
> > @@ -60,50 +68,27 @@ ENTRY (STRCHR)
> >         vpor    %ymm1, %ymm2, %ymm1
> >         vpmovmskb %ymm1, %eax
> >         testl   %eax, %eax
> > -       jnz     L(first_vec_x0)
> > -
> > -       /* Align data for aligned loads in the loop.  */
> > -       addq    $VEC_SIZE, %rdi
> > -       andl    $(VEC_SIZE - 1), %ecx
> > -       andq    $-VEC_SIZE, %rdi
> > -
> > -       jmp     L(more_4x_vec)
> > -
> > -       .p2align 4
> > -L(cros_page_boundary):
> > -       andl    $(VEC_SIZE - 1), %ecx
> > -       andq    $-VEC_SIZE, %rdi
> > -       vmovdqu (%rdi), %ymm8
> > -       VPCMPEQ %ymm8, %ymm0, %ymm1
> > -       VPCMPEQ %ymm8, %ymm9, %ymm2
> > -       vpor    %ymm1, %ymm2, %ymm1
> > -       vpmovmskb %ymm1, %eax
> > -       /* Remove the leading bytes.  */
> > -       sarl    %cl, %eax
> > -       testl   %eax, %eax
> > -       jz      L(aligned_more)
> > -       /* Found CHAR or the null byte.  */
> > +       jz      L(more_vecs)
> >         tzcntl  %eax, %eax
> > -       addq    %rcx, %rax
> > -# ifdef USE_AS_STRCHRNUL
> > +       /* Found CHAR or the null byte.  */
> >         addq    %rdi, %rax
> > -# else
> > -       xorl    %edx, %edx
> > -       leaq    (%rdi, %rax), %rax
> > -       cmp     (%rax), %CHAR_REG
> > +# ifndef USE_AS_STRCHRNUL
> > +       cmp (%rax), %CHAR_REG
> >         cmovne  %rdx, %rax
> >  # endif
> >         VZEROUPPER
> >         ret
> >
> >         .p2align 4
> > +L(more_vecs):
> > +       /* Align data for aligned loads in the loop.  */
> > +       andq    $-VEC_SIZE, %rdi
> >  L(aligned_more):
> > -       addq    $VEC_SIZE, %rdi
> >
> > -L(more_4x_vec):
> > -       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > -          since data is only aligned to VEC_SIZE.  */
> > -       vmovdqa (%rdi), %ymm8
> > +       /* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > +          since data is only aligned to VEC_SIZE.      */
> > +       vmovdqa VEC_SIZE(%rdi), %ymm8
> > +       addq    $VEC_SIZE, %rdi
> >         VPCMPEQ %ymm8, %ymm0, %ymm1
> >         VPCMPEQ %ymm8, %ymm9, %ymm2
> >         vpor    %ymm1, %ymm2, %ymm1
> > @@ -125,7 +110,7 @@ L(more_4x_vec):
> >         vpor    %ymm1, %ymm2, %ymm1
> >         vpmovmskb %ymm1, %eax
> >         testl   %eax, %eax
> > -       jnz     L(first_vec_x2)
> > +       jnz     L(first_vec_x2)
> >
> >         vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> >         VPCMPEQ %ymm8, %ymm0, %ymm1
> > @@ -133,122 +118,136 @@ L(more_4x_vec):
> >         vpor    %ymm1, %ymm2, %ymm1
> >         vpmovmskb %ymm1, %eax
> >         testl   %eax, %eax
> > -       jnz     L(first_vec_x3)
> > -
> > -       addq    $(VEC_SIZE * 4), %rdi
> > -
> > -       /* Align data to 4 * VEC_SIZE.  */
> > -       movq    %rdi, %rcx
> > -       andl    $(4 * VEC_SIZE - 1), %ecx
> > -       andq    $-(4 * VEC_SIZE), %rdi
> > -
> > -       .p2align 4
> > -L(loop_4x_vec):
> > -       /* Compare 4 * VEC at a time forward.  */
> > -       vmovdqa (%rdi), %ymm5
> > -       vmovdqa VEC_SIZE(%rdi), %ymm6
> > -       vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> > -       vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > -
> > -       VPCMPEQ %ymm5, %ymm0, %ymm1
> > -       VPCMPEQ %ymm6, %ymm0, %ymm2
> > -       VPCMPEQ %ymm7, %ymm0, %ymm3
> > -       VPCMPEQ %ymm8, %ymm0, %ymm4
> > -
> > -       VPCMPEQ %ymm5, %ymm9, %ymm5
> > -       VPCMPEQ %ymm6, %ymm9, %ymm6
> > -       VPCMPEQ %ymm7, %ymm9, %ymm7
> > -       VPCMPEQ %ymm8, %ymm9, %ymm8
> > -
> > -       vpor    %ymm1, %ymm5, %ymm1
> > -       vpor    %ymm2, %ymm6, %ymm2
> > -       vpor    %ymm3, %ymm7, %ymm3
> > -       vpor    %ymm4, %ymm8, %ymm4
> > -
> > -       vpor    %ymm1, %ymm2, %ymm5
> > -       vpor    %ymm3, %ymm4, %ymm6
> > -
> > -       vpor    %ymm5, %ymm6, %ymm5
> > -
> > -       vpmovmskb %ymm5, %eax
> > -       testl   %eax, %eax
> > -       jnz     L(4x_vec_end)
> > -
> > -       addq    $(VEC_SIZE * 4), %rdi
> > +       jz      L(prep_loop_4x)
> >
> > -       jmp     L(loop_4x_vec)
> > +       tzcntl  %eax, %eax
> > +       leaq    (VEC_SIZE * 3)(%rdi, %rax), %rax
> > +# ifndef USE_AS_STRCHRNUL
> > +       cmp (%rax), %CHAR_REG
> > +       cmovne  %rdx, %rax
> > +# endif
> > +       VZEROUPPER
> > +       ret
> >
> >         .p2align 4
> >  L(first_vec_x0):
> > -       /* Found CHAR or the null byte.  */
> >         tzcntl  %eax, %eax
> > -# ifdef USE_AS_STRCHRNUL
> > +       /* Found CHAR or the null byte.  */
> >         addq    %rdi, %rax
> > -# else
> > -       xorl    %edx, %edx
> > -       leaq    (%rdi, %rax), %rax
> > -       cmp     (%rax), %CHAR_REG
> > +# ifndef USE_AS_STRCHRNUL
> > +       cmp (%rax), %CHAR_REG
> >         cmovne  %rdx, %rax
> >  # endif
> >         VZEROUPPER
> >         ret
> > -
> > +
> >         .p2align 4
> >  L(first_vec_x1):
> >         tzcntl  %eax, %eax
> > -# ifdef USE_AS_STRCHRNUL
> > -       addq    $VEC_SIZE, %rax
> > -       addq    %rdi, %rax
> > -# else
> > -       xorl    %edx, %edx
> >         leaq    VEC_SIZE(%rdi, %rax), %rax
> > -       cmp     (%rax), %CHAR_REG
> > +# ifndef USE_AS_STRCHRNUL
> > +       cmp (%rax), %CHAR_REG
> >         cmovne  %rdx, %rax
> >  # endif
> >         VZEROUPPER
> > -       ret
> > -
> > +       ret
> > +
> >         .p2align 4
> >  L(first_vec_x2):
> >         tzcntl  %eax, %eax
> > -# ifdef USE_AS_STRCHRNUL
> > -       addq    $(VEC_SIZE * 2), %rax
> > -       addq    %rdi, %rax
> > -# else
> > -       xorl    %edx, %edx
> > +       /* Found CHAR or the null byte.  */
> >         leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> > -       cmp     (%rax), %CHAR_REG
> > +# ifndef USE_AS_STRCHRNUL
> > +       cmp (%rax), %CHAR_REG
> >         cmovne  %rdx, %rax
> >  # endif
> >         VZEROUPPER
> >         ret
> > +
> > +L(prep_loop_4x):
> > +       /* Align data to 4 * VEC_SIZE.  */
> > +       andq    $-(VEC_SIZE * 4), %rdi
> >
> >         .p2align 4
> > -L(4x_vec_end):
> > +L(loop_4x_vec):
> > +       /* Compare 4 * VEC at a time forward.  */
> > +       vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> > +       vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> > +       vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> > +       vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> > +
> > +       /* Leaves only CHARS matching esi as 0.  */
> > +       vpxor   %ymm5, %ymm0, %ymm1
> > +       vpxor   %ymm6, %ymm0, %ymm2
> > +       vpxor   %ymm7, %ymm0, %ymm3
> > +       vpxor   %ymm8, %ymm0, %ymm4
> > +
> > +       VPMINU  %ymm1, %ymm5, %ymm1
> > +       VPMINU  %ymm2, %ymm6, %ymm2
> > +       VPMINU  %ymm3, %ymm7, %ymm3
> > +       VPMINU  %ymm4, %ymm8, %ymm4
> > +
> > +       VPMINU  %ymm1, %ymm2, %ymm5
> > +       VPMINU  %ymm3, %ymm4, %ymm6
> > +
> > +       VPMINU  %ymm5, %ymm6, %ymm5
> > +
> > +       VPCMPEQ %ymm5, %ymm9, %ymm5
> > +       vpmovmskb %ymm5, %eax
> > +
> > +       addq    $(VEC_SIZE * 4), %rdi
> > +       testl   %eax, %eax
> > +       jz  L(loop_4x_vec)
> > +
> > +       VPCMPEQ %ymm1, %ymm9, %ymm1
> >         vpmovmskb %ymm1, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x0)
> > +
> > +       VPCMPEQ %ymm2, %ymm9, %ymm2
> >         vpmovmskb %ymm2, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x1)
> > -       vpmovmskb %ymm3, %eax
> > -       testl   %eax, %eax
> > -       jnz     L(first_vec_x2)
> > +
> > +       VPCMPEQ %ymm3, %ymm9, %ymm3
> > +       VPCMPEQ %ymm4, %ymm9, %ymm4
> > +       vpmovmskb %ymm3, %ecx
> >         vpmovmskb %ymm4, %eax
> > +       salq    $32, %rax
> > +       orq %rcx, %rax
> > +       tzcntq  %rax, %rax
> > +       leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> > +# ifndef USE_AS_STRCHRNUL
> > +       cmp (%rax), %CHAR_REG
> > +       cmovne  %rdx, %rax
> > +# endif
> > +       VZEROUPPER
> > +       ret
> > +
> > +       /* Cold case for crossing page with first load.  */
> > +       .p2align 4
> > +L(cross_page_boundary):
> > +       andq    $-VEC_SIZE, %rdi
> > +       andl    $(VEC_SIZE - 1), %ecx
> > +
> > +       vmovdqa (%rdi), %ymm8
> > +       VPCMPEQ %ymm8, %ymm0, %ymm1
> > +       VPCMPEQ %ymm8, %ymm9, %ymm2
> > +       vpor    %ymm1, %ymm2, %ymm1
> > +       vpmovmskb %ymm1, %eax
> > +       /* Remove the leading bits.      */
> > +       sarxl   %ecx, %eax, %eax
> >         testl   %eax, %eax
> > -L(first_vec_x3):
> > +       jz      L(aligned_more)
> >         tzcntl  %eax, %eax
> > -# ifdef USE_AS_STRCHRNUL
> > -       addq    $(VEC_SIZE * 3), %rax
> > +       addq    %rcx, %rdi
> >         addq    %rdi, %rax
> > -# else
> > -       xorl    %edx, %edx
> > -       leaq    (VEC_SIZE * 3)(%rdi, %rax), %rax
> > -       cmp     (%rax), %CHAR_REG
> > +# ifndef USE_AS_STRCHRNUL
> > +       cmp (%rax), %CHAR_REG
> >         cmovne  %rdx, %rax
> >  # endif
> >         VZEROUPPER
> >         ret
> >
> >  END (STRCHR)
> > -#endif
> > +# endif
> > diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
> > index 583a152794..4dfbe3b58b 100644
> > --- a/sysdeps/x86_64/multiarch/strchr.c
> > +++ b/sysdeps/x86_64/multiarch/strchr.c
> > @@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
> >
> >    if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
> >        && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> >        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> >      return OPTIMIZE (avx2);
> >
> > --
> > 2.29.2
> >
>
> LGTM.
>
> Thanks.
>

This is the updated patch, with the extra whitespace issues fixed, that I am checking in.
  
Noah Goldstein Feb. 8, 2021, 7:48 p.m. UTC | #3
On Mon, Feb 8, 2021 at 2:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Feb 8, 2021 at 6:08 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Feb 2, 2021 at 9:39 PM <goldstein.w.n@gmail.com> wrote:
> > >
> > > From: noah <goldstein.w.n@gmail.com>
> > >
> > > No bug. Just seemed the performance could be improved a bit. Observed
> > > and expected behavior are unchanged. Optimized body of main
> > > loop. Updated page cross logic and optimized accordingly. Made a few
> > > minor instruction selection modifications. No regressions in test
> > > suite. Both test-strchrnul and test-strchr passed.
> > >
> > > Signed-off-by: noah <goldstein.w.n@gmail.com>
> > > ---
> > >  sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
> > >  sysdeps/x86_64/multiarch/strchr.c      |   1 +
> > >  2 files changed, 118 insertions(+), 118 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > index d416558d04..8b9d78b55a 100644
> > > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > @@ -27,10 +27,12 @@
> > >  # ifdef USE_AS_WCSCHR
> > >  #  define VPBROADCAST  vpbroadcastd
> > >  #  define VPCMPEQ      vpcmpeqd
> > > +#  define VPMINU       vpminud
> > >  #  define CHAR_REG     esi
> > >  # else
> > >  #  define VPBROADCAST  vpbroadcastb
> > >  #  define VPCMPEQ      vpcmpeqb
> > > +#  define VPMINU       vpminub
> > >  #  define CHAR_REG     sil
> > >  # endif
> > >
> > > @@ -39,20 +41,26 @@
> > >  # endif
> > >
> > >  # define VEC_SIZE 32
> > > +# define PAGE_SIZE 4096
> > >
> > >         .section .text.avx,"ax",@progbits
> > >  ENTRY (STRCHR)
> > >         movl    %edi, %ecx
> > > -       /* Broadcast CHAR to YMM0.  */
> > > +# ifndef USE_AS_STRCHRNUL
> > > +       xorl    %edx, %edx
> > > +# endif
> > > +
> > > +       /* Broadcast CHAR to YMM0.      */
> > >         vmovd   %esi, %xmm0
> > >         vpxor   %xmm9, %xmm9, %xmm9
> > >         VPBROADCAST %xmm0, %ymm0
> > > -       /* Check if we may cross page boundary with one vector load.  */
> > > -       andl    $(2 * VEC_SIZE - 1), %ecx
> > > -       cmpl    $VEC_SIZE, %ecx
> > > -       ja      L(cros_page_boundary)
> > > -
> > > -       /* Check the first VEC_SIZE bytes.  Search for both CHAR and the
> > > +
> > > +       /* Check if we cross page boundary with one vector load.  */
> > > +       andl    $(PAGE_SIZE - 1), %ecx
> > > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> > > +       ja  L(cross_page_boundary)
> > > +
> > > +       /* Check the first VEC_SIZE bytes.      Search for both CHAR and the
> > >            null byte.  */
> > >         vmovdqu (%rdi), %ymm8
> > >         VPCMPEQ %ymm8, %ymm0, %ymm1
> > > @@ -60,50 +68,27 @@ ENTRY (STRCHR)
> > >         vpor    %ymm1, %ymm2, %ymm1
> > >         vpmovmskb %ymm1, %eax
> > >         testl   %eax, %eax
> > > -       jnz     L(first_vec_x0)
> > > -
> > > -       /* Align data for aligned loads in the loop.  */
> > > -       addq    $VEC_SIZE, %rdi
> > > -       andl    $(VEC_SIZE - 1), %ecx
> > > -       andq    $-VEC_SIZE, %rdi
> > > -
> > > -       jmp     L(more_4x_vec)
> > > -
> > > -       .p2align 4
> > > -L(cros_page_boundary):
> > > -       andl    $(VEC_SIZE - 1), %ecx
> > > -       andq    $-VEC_SIZE, %rdi
> > > -       vmovdqu (%rdi), %ymm8
> > > -       VPCMPEQ %ymm8, %ymm0, %ymm1
> > > -       VPCMPEQ %ymm8, %ymm9, %ymm2
> > > -       vpor    %ymm1, %ymm2, %ymm1
> > > -       vpmovmskb %ymm1, %eax
> > > -       /* Remove the leading bytes.  */
> > > -       sarl    %cl, %eax
> > > -       testl   %eax, %eax
> > > -       jz      L(aligned_more)
> > > -       /* Found CHAR or the null byte.  */
> > > +       jz      L(more_vecs)
> > >         tzcntl  %eax, %eax
> > > -       addq    %rcx, %rax
> > > -# ifdef USE_AS_STRCHRNUL
> > > +       /* Found CHAR or the null byte.  */
> > >         addq    %rdi, %rax
> > > -# else
> > > -       xorl    %edx, %edx
> > > -       leaq    (%rdi, %rax), %rax
> > > -       cmp     (%rax), %CHAR_REG
> > > +# ifndef USE_AS_STRCHRNUL
> > > +       cmp (%rax), %CHAR_REG
> > >         cmovne  %rdx, %rax
> > >  # endif
> > >         VZEROUPPER
> > >         ret
> > >
> > >         .p2align 4
> > > +L(more_vecs):
> > > +       /* Align data for aligned loads in the loop.  */
> > > +       andq    $-VEC_SIZE, %rdi
> > >  L(aligned_more):
> > > -       addq    $VEC_SIZE, %rdi
> > >
> > > -L(more_4x_vec):
> > > -       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > > -          since data is only aligned to VEC_SIZE.  */
> > > -       vmovdqa (%rdi), %ymm8
> > > +       /* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > > +          since data is only aligned to VEC_SIZE.      */
> > > +       vmovdqa VEC_SIZE(%rdi), %ymm8
> > > +       addq    $VEC_SIZE, %rdi
> > >         VPCMPEQ %ymm8, %ymm0, %ymm1
> > >         VPCMPEQ %ymm8, %ymm9, %ymm2
> > >         vpor    %ymm1, %ymm2, %ymm1
> > > @@ -125,7 +110,7 @@ L(more_4x_vec):
> > >         vpor    %ymm1, %ymm2, %ymm1
> > >         vpmovmskb %ymm1, %eax
> > >         testl   %eax, %eax
> > > -       jnz     L(first_vec_x2)
> > > +       jnz     L(first_vec_x2)
> > >
> > >         vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > >         VPCMPEQ %ymm8, %ymm0, %ymm1
> > > @@ -133,122 +118,136 @@ L(more_4x_vec):
> > >         vpor    %ymm1, %ymm2, %ymm1
> > >         vpmovmskb %ymm1, %eax
> > >         testl   %eax, %eax
> > > -       jnz     L(first_vec_x3)
> > > -
> > > -       addq    $(VEC_SIZE * 4), %rdi
> > > -
> > > -       /* Align data to 4 * VEC_SIZE.  */
> > > -       movq    %rdi, %rcx
> > > -       andl    $(4 * VEC_SIZE - 1), %ecx
> > > -       andq    $-(4 * VEC_SIZE), %rdi
> > > -
> > > -       .p2align 4
> > > -L(loop_4x_vec):
> > > -       /* Compare 4 * VEC at a time forward.  */
> > > -       vmovdqa (%rdi), %ymm5
> > > -       vmovdqa VEC_SIZE(%rdi), %ymm6
> > > -       vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> > > -       vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > > -
> > > -       VPCMPEQ %ymm5, %ymm0, %ymm1
> > > -       VPCMPEQ %ymm6, %ymm0, %ymm2
> > > -       VPCMPEQ %ymm7, %ymm0, %ymm3
> > > -       VPCMPEQ %ymm8, %ymm0, %ymm4
> > > -
> > > -       VPCMPEQ %ymm5, %ymm9, %ymm5
> > > -       VPCMPEQ %ymm6, %ymm9, %ymm6
> > > -       VPCMPEQ %ymm7, %ymm9, %ymm7
> > > -       VPCMPEQ %ymm8, %ymm9, %ymm8
> > > -
> > > -       vpor    %ymm1, %ymm5, %ymm1
> > > -       vpor    %ymm2, %ymm6, %ymm2
> > > -       vpor    %ymm3, %ymm7, %ymm3
> > > -       vpor    %ymm4, %ymm8, %ymm4
> > > -
> > > -       vpor    %ymm1, %ymm2, %ymm5
> > > -       vpor    %ymm3, %ymm4, %ymm6
> > > -
> > > -       vpor    %ymm5, %ymm6, %ymm5
> > > -
> > > -       vpmovmskb %ymm5, %eax
> > > -       testl   %eax, %eax
> > > -       jnz     L(4x_vec_end)
> > > -
> > > -       addq    $(VEC_SIZE * 4), %rdi
> > > +       jz      L(prep_loop_4x)
> > >
> > > -       jmp     L(loop_4x_vec)
> > > +       tzcntl  %eax, %eax
> > > +       leaq    (VEC_SIZE * 3)(%rdi, %rax), %rax
> > > +# ifndef USE_AS_STRCHRNUL
> > > +       cmp (%rax), %CHAR_REG
> > > +       cmovne  %rdx, %rax
> > > +# endif
> > > +       VZEROUPPER
> > > +       ret
> > >
> > >         .p2align 4
> > >  L(first_vec_x0):
> > > -       /* Found CHAR or the null byte.  */
> > >         tzcntl  %eax, %eax
> > > -# ifdef USE_AS_STRCHRNUL
> > > +       /* Found CHAR or the null byte.  */
> > >         addq    %rdi, %rax
> > > -# else
> > > -       xorl    %edx, %edx
> > > -       leaq    (%rdi, %rax), %rax
> > > -       cmp     (%rax), %CHAR_REG
> > > +# ifndef USE_AS_STRCHRNUL
> > > +       cmp (%rax), %CHAR_REG
> > >         cmovne  %rdx, %rax
> > >  # endif
> > >         VZEROUPPER
> > >         ret
> > > -
> > > +
> > >         .p2align 4
> > >  L(first_vec_x1):
> > >         tzcntl  %eax, %eax
> > > -# ifdef USE_AS_STRCHRNUL
> > > -       addq    $VEC_SIZE, %rax
> > > -       addq    %rdi, %rax
> > > -# else
> > > -       xorl    %edx, %edx
> > >         leaq    VEC_SIZE(%rdi, %rax), %rax
> > > -       cmp     (%rax), %CHAR_REG
> > > +# ifndef USE_AS_STRCHRNUL
> > > +       cmp (%rax), %CHAR_REG
> > >         cmovne  %rdx, %rax
> > >  # endif
> > >         VZEROUPPER
> > > -       ret
> > > -
> > > +       ret
> > > +
> > >         .p2align 4
> > >  L(first_vec_x2):
> > >         tzcntl  %eax, %eax
> > > -# ifdef USE_AS_STRCHRNUL
> > > -       addq    $(VEC_SIZE * 2), %rax
> > > -       addq    %rdi, %rax
> > > -# else
> > > -       xorl    %edx, %edx
> > > +       /* Found CHAR or the null byte.  */
> > >         leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > -       cmp     (%rax), %CHAR_REG
> > > +# ifndef USE_AS_STRCHRNUL
> > > +       cmp (%rax), %CHAR_REG
> > >         cmovne  %rdx, %rax
> > >  # endif
> > >         VZEROUPPER
> > >         ret
> > > +
> > > +L(prep_loop_4x):
> > > +       /* Align data to 4 * VEC_SIZE.  */
> > > +       andq    $-(VEC_SIZE * 4), %rdi
> > >
> > >         .p2align 4
> > > -L(4x_vec_end):
> > > +L(loop_4x_vec):
> > > +       /* Compare 4 * VEC at a time forward.  */
> > > +       vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> > > +       vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> > > +       vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> > > +       vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> > > +
> > > +       /* Leaves only CHARS matching esi as 0.  */
> > > +       vpxor   %ymm5, %ymm0, %ymm1
> > > +       vpxor   %ymm6, %ymm0, %ymm2
> > > +       vpxor   %ymm7, %ymm0, %ymm3
> > > +       vpxor   %ymm8, %ymm0, %ymm4
> > > +
> > > +       VPMINU  %ymm1, %ymm5, %ymm1
> > > +       VPMINU  %ymm2, %ymm6, %ymm2
> > > +       VPMINU  %ymm3, %ymm7, %ymm3
> > > +       VPMINU  %ymm4, %ymm8, %ymm4
> > > +
> > > +       VPMINU  %ymm1, %ymm2, %ymm5
> > > +       VPMINU  %ymm3, %ymm4, %ymm6
> > > +
> > > +       VPMINU  %ymm5, %ymm6, %ymm5
> > > +
> > > +       VPCMPEQ %ymm5, %ymm9, %ymm5
> > > +       vpmovmskb %ymm5, %eax
> > > +
> > > +       addq    $(VEC_SIZE * 4), %rdi
> > > +       testl   %eax, %eax
> > > +       jz  L(loop_4x_vec)
> > > +
> > > +       VPCMPEQ %ymm1, %ymm9, %ymm1
> > >         vpmovmskb %ymm1, %eax
> > >         testl   %eax, %eax
> > >         jnz     L(first_vec_x0)
> > > +
> > > +       VPCMPEQ %ymm2, %ymm9, %ymm2
> > >         vpmovmskb %ymm2, %eax
> > >         testl   %eax, %eax
> > >         jnz     L(first_vec_x1)
> > > -       vpmovmskb %ymm3, %eax
> > > -       testl   %eax, %eax
> > > -       jnz     L(first_vec_x2)
> > > +
> > > +       VPCMPEQ %ymm3, %ymm9, %ymm3
> > > +       VPCMPEQ %ymm4, %ymm9, %ymm4
> > > +       vpmovmskb %ymm3, %ecx
> > >         vpmovmskb %ymm4, %eax
> > > +       salq    $32, %rax
> > > +       orq %rcx, %rax
> > > +       tzcntq  %rax, %rax
> > > +       leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > +# ifndef USE_AS_STRCHRNUL
> > > +       cmp (%rax), %CHAR_REG
> > > +       cmovne  %rdx, %rax
> > > +# endif
> > > +       VZEROUPPER
> > > +       ret
> > > +
> > > +       /* Cold case for crossing page with first load.  */
> > > +       .p2align 4
> > > +L(cross_page_boundary):
> > > +       andq    $-VEC_SIZE, %rdi
> > > +       andl    $(VEC_SIZE - 1), %ecx
> > > +
> > > +       vmovdqa (%rdi), %ymm8
> > > +       VPCMPEQ %ymm8, %ymm0, %ymm1
> > > +       VPCMPEQ %ymm8, %ymm9, %ymm2
> > > +       vpor    %ymm1, %ymm2, %ymm1
> > > +       vpmovmskb %ymm1, %eax
> > > +       /* Remove the leading bits.      */
> > > +       sarxl   %ecx, %eax, %eax
> > >         testl   %eax, %eax
> > > -L(first_vec_x3):
> > > +       jz      L(aligned_more)
> > >         tzcntl  %eax, %eax
> > > -# ifdef USE_AS_STRCHRNUL
> > > -       addq    $(VEC_SIZE * 3), %rax
> > > +       addq    %rcx, %rdi
> > >         addq    %rdi, %rax
> > > -# else
> > > -       xorl    %edx, %edx
> > > -       leaq    (VEC_SIZE * 3)(%rdi, %rax), %rax
> > > -       cmp     (%rax), %CHAR_REG
> > > +# ifndef USE_AS_STRCHRNUL
> > > +       cmp (%rax), %CHAR_REG
> > >         cmovne  %rdx, %rax
> > >  # endif
> > >         VZEROUPPER
> > >         ret
> > >
> > >  END (STRCHR)
> > > -#endif
> > > +# endif
> > > diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
> > > index 583a152794..4dfbe3b58b 100644
> > > --- a/sysdeps/x86_64/multiarch/strchr.c
> > > +++ b/sysdeps/x86_64/multiarch/strchr.c
> > > @@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
> > >
> > >    if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
> > >        && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > > +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> > >        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > >      return OPTIMIZE (avx2);
> > >
> > > --
> > > 2.29.2
> > >
> >
> > LGTM.
> >
> > Thanks.
> >
>
> This is the updated patch with extra white spaces fixed I am checking in.
>
> --
> H.J.

Awesome! Thanks!

N.G.
  
Noah Goldstein Feb. 8, 2021, 8:57 p.m. UTC | #4
On Mon, Feb 8, 2021 at 2:48 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Feb 8, 2021 at 2:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Feb 8, 2021 at 6:08 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Tue, Feb 2, 2021 at 9:39 PM <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > From: noah <goldstein.w.n@gmail.com>
> > > >
> > > > No bug. Just seemed the performance could be improved a bit. Observed
> > > > and expected behavior are unchanged. Optimized body of main
> > > > loop. Updated page cross logic and optimized accordingly. Made a few
> > > > minor instruction selection modifications. No regressions in test
> > > > suite. Both test-strchrnul and test-strchr passed.
> > > >
> > > > Signed-off-by: noah <goldstein.w.n@gmail.com>
> > > > ---
> > > >  sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
> > > >  sysdeps/x86_64/multiarch/strchr.c      |   1 +
> > > >  2 files changed, 118 insertions(+), 118 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > index d416558d04..8b9d78b55a 100644
> > > > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > @@ -27,10 +27,12 @@
> > > >  # ifdef USE_AS_WCSCHR
> > > >  #  define VPBROADCAST  vpbroadcastd
> > > >  #  define VPCMPEQ      vpcmpeqd
> > > > +#  define VPMINU       vpminud
> > > >  #  define CHAR_REG     esi
> > > >  # else
> > > >  #  define VPBROADCAST  vpbroadcastb
> > > >  #  define VPCMPEQ      vpcmpeqb
> > > > +#  define VPMINU       vpminub
> > > >  #  define CHAR_REG     sil
> > > >  # endif
> > > >
> > > > @@ -39,20 +41,26 @@
> > > >  # endif
> > > >
> > > >  # define VEC_SIZE 32
> > > > +# define PAGE_SIZE 4096
> > > >
> > > >         .section .text.avx,"ax",@progbits
> > > >  ENTRY (STRCHR)
> > > >         movl    %edi, %ecx
> > > > -       /* Broadcast CHAR to YMM0.  */
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > +       xorl    %edx, %edx
> > > > +# endif
> > > > +
> > > > +       /* Broadcast CHAR to YMM0.      */
> > > >         vmovd   %esi, %xmm0
> > > >         vpxor   %xmm9, %xmm9, %xmm9
> > > >         VPBROADCAST %xmm0, %ymm0
> > > > -       /* Check if we may cross page boundary with one vector load.  */
> > > > -       andl    $(2 * VEC_SIZE - 1), %ecx
> > > > -       cmpl    $VEC_SIZE, %ecx
> > > > -       ja      L(cros_page_boundary)
> > > > -
> > > > -       /* Check the first VEC_SIZE bytes.  Search for both CHAR and the
> > > > +
> > > > +       /* Check if we cross page boundary with one vector load.  */
> > > > +       andl    $(PAGE_SIZE - 1), %ecx
> > > > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> > > > +       ja  L(cross_page_boundary)
> > > > +
> > > > +       /* Check the first VEC_SIZE bytes.      Search for both CHAR and the
> > > >            null byte.  */
> > > >         vmovdqu (%rdi), %ymm8
> > > >         VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > @@ -60,50 +68,27 @@ ENTRY (STRCHR)
> > > >         vpor    %ymm1, %ymm2, %ymm1
> > > >         vpmovmskb %ymm1, %eax
> > > >         testl   %eax, %eax
> > > > -       jnz     L(first_vec_x0)
> > > > -
> > > > -       /* Align data for aligned loads in the loop.  */
> > > > -       addq    $VEC_SIZE, %rdi
> > > > -       andl    $(VEC_SIZE - 1), %ecx
> > > > -       andq    $-VEC_SIZE, %rdi
> > > > -
> > > > -       jmp     L(more_4x_vec)
> > > > -
> > > > -       .p2align 4
> > > > -L(cros_page_boundary):
> > > > -       andl    $(VEC_SIZE - 1), %ecx
> > > > -       andq    $-VEC_SIZE, %rdi
> > > > -       vmovdqu (%rdi), %ymm8
> > > > -       VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > -       VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > -       vpor    %ymm1, %ymm2, %ymm1
> > > > -       vpmovmskb %ymm1, %eax
> > > > -       /* Remove the leading bytes.  */
> > > > -       sarl    %cl, %eax
> > > > -       testl   %eax, %eax
> > > > -       jz      L(aligned_more)
> > > > -       /* Found CHAR or the null byte.  */
> > > > +       jz      L(more_vecs)
> > > >         tzcntl  %eax, %eax
> > > > -       addq    %rcx, %rax
> > > > -# ifdef USE_AS_STRCHRNUL
> > > > +       /* Found CHAR or the null byte.  */
> > > >         addq    %rdi, %rax
> > > > -# else
> > > > -       xorl    %edx, %edx
> > > > -       leaq    (%rdi, %rax), %rax
> > > > -       cmp     (%rax), %CHAR_REG
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > +       cmp (%rax), %CHAR_REG
> > > >         cmovne  %rdx, %rax
> > > >  # endif
> > > >         VZEROUPPER
> > > >         ret
> > > >
> > > >         .p2align 4
> > > > +L(more_vecs):
> > > > +       /* Align data for aligned loads in the loop.  */
> > > > +       andq    $-VEC_SIZE, %rdi
> > > >  L(aligned_more):
> > > > -       addq    $VEC_SIZE, %rdi
> > > >
> > > > -L(more_4x_vec):
> > > > -       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > > > -          since data is only aligned to VEC_SIZE.  */
> > > > -       vmovdqa (%rdi), %ymm8
> > > > +       /* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > > > +          since data is only aligned to VEC_SIZE.      */
> > > > +       vmovdqa VEC_SIZE(%rdi), %ymm8
> > > > +       addq    $VEC_SIZE, %rdi
> > > >         VPCMPEQ %ymm8, %ymm0, %ymm1
> > > >         VPCMPEQ %ymm8, %ymm9, %ymm2
> > > >         vpor    %ymm1, %ymm2, %ymm1
> > > > @@ -125,7 +110,7 @@ L(more_4x_vec):
> > > >         vpor    %ymm1, %ymm2, %ymm1
> > > >         vpmovmskb %ymm1, %eax
> > > >         testl   %eax, %eax
> > > > -       jnz     L(first_vec_x2)
> > > > +       jnz     L(first_vec_x2)
> > > >
> > > >         vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > > >         VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > @@ -133,122 +118,136 @@ L(more_4x_vec):
> > > >         vpor    %ymm1, %ymm2, %ymm1
> > > >         vpmovmskb %ymm1, %eax
> > > >         testl   %eax, %eax
> > > > -       jnz     L(first_vec_x3)
> > > > -
> > > > -       addq    $(VEC_SIZE * 4), %rdi
> > > > -
> > > > -       /* Align data to 4 * VEC_SIZE.  */
> > > > -       movq    %rdi, %rcx
> > > > -       andl    $(4 * VEC_SIZE - 1), %ecx
> > > > -       andq    $-(4 * VEC_SIZE), %rdi
> > > > -
> > > > -       .p2align 4
> > > > -L(loop_4x_vec):
> > > > -       /* Compare 4 * VEC at a time forward.  */
> > > > -       vmovdqa (%rdi), %ymm5
> > > > -       vmovdqa VEC_SIZE(%rdi), %ymm6
> > > > -       vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> > > > -       vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > > > -
> > > > -       VPCMPEQ %ymm5, %ymm0, %ymm1
> > > > -       VPCMPEQ %ymm6, %ymm0, %ymm2
> > > > -       VPCMPEQ %ymm7, %ymm0, %ymm3
> > > > -       VPCMPEQ %ymm8, %ymm0, %ymm4
> > > > -
> > > > -       VPCMPEQ %ymm5, %ymm9, %ymm5
> > > > -       VPCMPEQ %ymm6, %ymm9, %ymm6
> > > > -       VPCMPEQ %ymm7, %ymm9, %ymm7
> > > > -       VPCMPEQ %ymm8, %ymm9, %ymm8
> > > > -
> > > > -       vpor    %ymm1, %ymm5, %ymm1
> > > > -       vpor    %ymm2, %ymm6, %ymm2
> > > > -       vpor    %ymm3, %ymm7, %ymm3
> > > > -       vpor    %ymm4, %ymm8, %ymm4
> > > > -
> > > > -       vpor    %ymm1, %ymm2, %ymm5
> > > > -       vpor    %ymm3, %ymm4, %ymm6
> > > > -
> > > > -       vpor    %ymm5, %ymm6, %ymm5
> > > > -
> > > > -       vpmovmskb %ymm5, %eax
> > > > -       testl   %eax, %eax
> > > > -       jnz     L(4x_vec_end)
> > > > -
> > > > -       addq    $(VEC_SIZE * 4), %rdi
> > > > +       jz      L(prep_loop_4x)
> > > >
> > > > -       jmp     L(loop_4x_vec)
> > > > +       tzcntl  %eax, %eax
> > > > +       leaq    (VEC_SIZE * 3)(%rdi, %rax), %rax
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > +       cmp (%rax), %CHAR_REG
> > > > +       cmovne  %rdx, %rax
> > > > +# endif
> > > > +       VZEROUPPER
> > > > +       ret
> > > >
> > > >         .p2align 4
> > > >  L(first_vec_x0):
> > > > -       /* Found CHAR or the null byte.  */
> > > >         tzcntl  %eax, %eax
> > > > -# ifdef USE_AS_STRCHRNUL
> > > > +       /* Found CHAR or the null byte.  */
> > > >         addq    %rdi, %rax
> > > > -# else
> > > > -       xorl    %edx, %edx
> > > > -       leaq    (%rdi, %rax), %rax
> > > > -       cmp     (%rax), %CHAR_REG
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > +       cmp (%rax), %CHAR_REG
> > > >         cmovne  %rdx, %rax
> > > >  # endif
> > > >         VZEROUPPER
> > > >         ret
> > > > -
> > > > +
> > > >         .p2align 4
> > > >  L(first_vec_x1):
> > > >         tzcntl  %eax, %eax
> > > > -# ifdef USE_AS_STRCHRNUL
> > > > -       addq    $VEC_SIZE, %rax
> > > > -       addq    %rdi, %rax
> > > > -# else
> > > > -       xorl    %edx, %edx
> > > >         leaq    VEC_SIZE(%rdi, %rax), %rax
> > > > -       cmp     (%rax), %CHAR_REG
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > +       cmp (%rax), %CHAR_REG
> > > >         cmovne  %rdx, %rax
> > > >  # endif
> > > >         VZEROUPPER
> > > > -       ret
> > > > -
> > > > +       ret
> > > > +
> > > >         .p2align 4
> > > >  L(first_vec_x2):
> > > >         tzcntl  %eax, %eax
> > > > -# ifdef USE_AS_STRCHRNUL
> > > > -       addq    $(VEC_SIZE * 2), %rax
> > > > -       addq    %rdi, %rax
> > > > -# else
> > > > -       xorl    %edx, %edx
> > > > +       /* Found CHAR or the null byte.  */
> > > >         leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > > -       cmp     (%rax), %CHAR_REG
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > +       cmp (%rax), %CHAR_REG
> > > >         cmovne  %rdx, %rax
> > > >  # endif
> > > >         VZEROUPPER
> > > >         ret
> > > > +
> > > > +L(prep_loop_4x):
> > > > +       /* Align data to 4 * VEC_SIZE.  */
> > > > +       andq    $-(VEC_SIZE * 4), %rdi
> > > >
> > > >         .p2align 4
> > > > -L(4x_vec_end):
> > > > +L(loop_4x_vec):
> > > > +       /* Compare 4 * VEC at a time forward.  */
> > > > +       vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> > > > +       vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> > > > +       vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> > > > +       vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> > > > +
> > > > +       /* Leaves only CHARS matching esi as 0.  */
> > > > +       vpxor   %ymm5, %ymm0, %ymm1
> > > > +       vpxor   %ymm6, %ymm0, %ymm2
> > > > +       vpxor   %ymm7, %ymm0, %ymm3
> > > > +       vpxor   %ymm8, %ymm0, %ymm4
> > > > +
> > > > +       VPMINU  %ymm1, %ymm5, %ymm1
> > > > +       VPMINU  %ymm2, %ymm6, %ymm2
> > > > +       VPMINU  %ymm3, %ymm7, %ymm3
> > > > +       VPMINU  %ymm4, %ymm8, %ymm4
> > > > +
> > > > +       VPMINU  %ymm1, %ymm2, %ymm5
> > > > +       VPMINU  %ymm3, %ymm4, %ymm6
> > > > +
> > > > +       VPMINU  %ymm5, %ymm6, %ymm5
> > > > +
> > > > +       VPCMPEQ %ymm5, %ymm9, %ymm5
> > > > +       vpmovmskb %ymm5, %eax
> > > > +
> > > > +       addq    $(VEC_SIZE * 4), %rdi
> > > > +       testl   %eax, %eax
> > > > +       jz  L(loop_4x_vec)
> > > > +
> > > > +       VPCMPEQ %ymm1, %ymm9, %ymm1
> > > >         vpmovmskb %ymm1, %eax
> > > >         testl   %eax, %eax
> > > >         jnz     L(first_vec_x0)
> > > > +
> > > > +       VPCMPEQ %ymm2, %ymm9, %ymm2
> > > >         vpmovmskb %ymm2, %eax
> > > >         testl   %eax, %eax
> > > >         jnz     L(first_vec_x1)
> > > > -       vpmovmskb %ymm3, %eax
> > > > -       testl   %eax, %eax
> > > > -       jnz     L(first_vec_x2)
> > > > +
> > > > +       VPCMPEQ %ymm3, %ymm9, %ymm3
> > > > +       VPCMPEQ %ymm4, %ymm9, %ymm4
> > > > +       vpmovmskb %ymm3, %ecx
> > > >         vpmovmskb %ymm4, %eax
> > > > +       salq    $32, %rax
> > > > +       orq %rcx, %rax
> > > > +       tzcntq  %rax, %rax
> > > > +       leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > +       cmp (%rax), %CHAR_REG
> > > > +       cmovne  %rdx, %rax
> > > > +# endif
> > > > +       VZEROUPPER
> > > > +       ret
> > > > +
> > > > +       /* Cold case for crossing page with first load.  */
> > > > +       .p2align 4
> > > > +L(cross_page_boundary):
> > > > +       andq    $-VEC_SIZE, %rdi
> > > > +       andl    $(VEC_SIZE - 1), %ecx
> > > > +
> > > > +       vmovdqa (%rdi), %ymm8
> > > > +       VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > +       VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > +       vpor    %ymm1, %ymm2, %ymm1
> > > > +       vpmovmskb %ymm1, %eax
> > > > +       /* Remove the leading bits.      */
> > > > +       sarxl   %ecx, %eax, %eax
> > > >         testl   %eax, %eax
> > > > -L(first_vec_x3):
> > > > +       jz      L(aligned_more)
> > > >         tzcntl  %eax, %eax
> > > > -# ifdef USE_AS_STRCHRNUL
> > > > -       addq    $(VEC_SIZE * 3), %rax
> > > > +       addq    %rcx, %rdi
> > > >         addq    %rdi, %rax
> > > > -# else
> > > > -       xorl    %edx, %edx
> > > > -       leaq    (VEC_SIZE * 3)(%rdi, %rax), %rax
> > > > -       cmp     (%rax), %CHAR_REG
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > +       cmp (%rax), %CHAR_REG
> > > >         cmovne  %rdx, %rax
> > > >  # endif
> > > >         VZEROUPPER
> > > >         ret
> > > >
> > > >  END (STRCHR)
> > > > -#endif
> > > > +# endif
> > > > diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
> > > > index 583a152794..4dfbe3b58b 100644
> > > > --- a/sysdeps/x86_64/multiarch/strchr.c
> > > > +++ b/sysdeps/x86_64/multiarch/strchr.c
> > > > @@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
> > > >
> > > >    if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
> > > >        && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > > > +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> > > >        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > > >      return OPTIMIZE (avx2);
> > > >
> > > > --
> > > > 2.29.2
> > > >
> > >
> > > LGTM.
> > >
> > > Thanks.
> > >
> >
> > This is the updated patch with extra white spaces fixed I am checking in.
> >
> > --
> > H.J.
>
> Awesome! Thanks!
>
> N.G.

Shoot, just realized this one has the old commit message that only
references test-strchr and test-strchrnul as passing (missing
reference to test-wcschr and test-wcschrnul).

Do you want me to send another patch with the proper commit message, or can
you fix it on your end — or does it not really matter?

N.G.
  
Sunil Pandey April 27, 2022, 11:43 p.m. UTC | #5
On Mon, Feb 8, 2021 at 1:46 PM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Mon, Feb 8, 2021 at 2:48 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Mon, Feb 8, 2021 at 2:33 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Mon, Feb 8, 2021 at 6:08 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Tue, Feb 2, 2021 at 9:39 PM <goldstein.w.n@gmail.com> wrote:
> > > > >
> > > > > From: noah <goldstein.w.n@gmail.com>
> > > > >
> > > > > No bug. Just seemed the performance could be improved a bit. Observed
> > > > > and expected behavior are unchanged. Optimized body of main
> > > > > loop. Updated page cross logic and optimized accordingly. Made a few
> > > > > minor instruction selection modifications. No regressions in test
> > > > > suite. Both test-strchrnul and test-strchr passed.
> > > > >
> > > > > Signed-off-by: noah <goldstein.w.n@gmail.com>
> > > > > ---
> > > > >  sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
> > > > >  sysdeps/x86_64/multiarch/strchr.c      |   1 +
> > > > >  2 files changed, 118 insertions(+), 118 deletions(-)
> > > > >
> > > > > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > > index d416558d04..8b9d78b55a 100644
> > > > > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > > > > @@ -27,10 +27,12 @@
> > > > >  # ifdef USE_AS_WCSCHR
> > > > >  #  define VPBROADCAST  vpbroadcastd
> > > > >  #  define VPCMPEQ      vpcmpeqd
> > > > > +#  define VPMINU       vpminud
> > > > >  #  define CHAR_REG     esi
> > > > >  # else
> > > > >  #  define VPBROADCAST  vpbroadcastb
> > > > >  #  define VPCMPEQ      vpcmpeqb
> > > > > +#  define VPMINU       vpminub
> > > > >  #  define CHAR_REG     sil
> > > > >  # endif
> > > > >
> > > > > @@ -39,20 +41,26 @@
> > > > >  # endif
> > > > >
> > > > >  # define VEC_SIZE 32
> > > > > +# define PAGE_SIZE 4096
> > > > >
> > > > >         .section .text.avx,"ax",@progbits
> > > > >  ENTRY (STRCHR)
> > > > >         movl    %edi, %ecx
> > > > > -       /* Broadcast CHAR to YMM0.  */
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > +       xorl    %edx, %edx
> > > > > +# endif
> > > > > +
> > > > > +       /* Broadcast CHAR to YMM0.      */
> > > > >         vmovd   %esi, %xmm0
> > > > >         vpxor   %xmm9, %xmm9, %xmm9
> > > > >         VPBROADCAST %xmm0, %ymm0
> > > > > -       /* Check if we may cross page boundary with one vector load.  */
> > > > > -       andl    $(2 * VEC_SIZE - 1), %ecx
> > > > > -       cmpl    $VEC_SIZE, %ecx
> > > > > -       ja      L(cros_page_boundary)
> > > > > -
> > > > > -       /* Check the first VEC_SIZE bytes.  Search for both CHAR and the
> > > > > +
> > > > > +       /* Check if we cross page boundary with one vector load.  */
> > > > > +       andl    $(PAGE_SIZE - 1), %ecx
> > > > > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> > > > > +       ja  L(cross_page_boundary)
> > > > > +
> > > > > +       /* Check the first VEC_SIZE bytes.      Search for both CHAR and the
> > > > >            null byte.  */
> > > > >         vmovdqu (%rdi), %ymm8
> > > > >         VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > > @@ -60,50 +68,27 @@ ENTRY (STRCHR)
> > > > >         vpor    %ymm1, %ymm2, %ymm1
> > > > >         vpmovmskb %ymm1, %eax
> > > > >         testl   %eax, %eax
> > > > > -       jnz     L(first_vec_x0)
> > > > > -
> > > > > -       /* Align data for aligned loads in the loop.  */
> > > > > -       addq    $VEC_SIZE, %rdi
> > > > > -       andl    $(VEC_SIZE - 1), %ecx
> > > > > -       andq    $-VEC_SIZE, %rdi
> > > > > -
> > > > > -       jmp     L(more_4x_vec)
> > > > > -
> > > > > -       .p2align 4
> > > > > -L(cros_page_boundary):
> > > > > -       andl    $(VEC_SIZE - 1), %ecx
> > > > > -       andq    $-VEC_SIZE, %rdi
> > > > > -       vmovdqu (%rdi), %ymm8
> > > > > -       VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > > -       VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > > -       vpor    %ymm1, %ymm2, %ymm1
> > > > > -       vpmovmskb %ymm1, %eax
> > > > > -       /* Remove the leading bytes.  */
> > > > > -       sarl    %cl, %eax
> > > > > -       testl   %eax, %eax
> > > > > -       jz      L(aligned_more)
> > > > > -       /* Found CHAR or the null byte.  */
> > > > > +       jz      L(more_vecs)
> > > > >         tzcntl  %eax, %eax
> > > > > -       addq    %rcx, %rax
> > > > > -# ifdef USE_AS_STRCHRNUL
> > > > > +       /* Found CHAR or the null byte.  */
> > > > >         addq    %rdi, %rax
> > > > > -# else
> > > > > -       xorl    %edx, %edx
> > > > > -       leaq    (%rdi, %rax), %rax
> > > > > -       cmp     (%rax), %CHAR_REG
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > +       cmp (%rax), %CHAR_REG
> > > > >         cmovne  %rdx, %rax
> > > > >  # endif
> > > > >         VZEROUPPER
> > > > >         ret
> > > > >
> > > > >         .p2align 4
> > > > > +L(more_vecs):
> > > > > +       /* Align data for aligned loads in the loop.  */
> > > > > +       andq    $-VEC_SIZE, %rdi
> > > > >  L(aligned_more):
> > > > > -       addq    $VEC_SIZE, %rdi
> > > > >
> > > > > -L(more_4x_vec):
> > > > > -       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > > > > -          since data is only aligned to VEC_SIZE.  */
> > > > > -       vmovdqa (%rdi), %ymm8
> > > > > +       /* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > > > > +          since data is only aligned to VEC_SIZE.      */
> > > > > +       vmovdqa VEC_SIZE(%rdi), %ymm8
> > > > > +       addq    $VEC_SIZE, %rdi
> > > > >         VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > >         VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > >         vpor    %ymm1, %ymm2, %ymm1
> > > > > @@ -125,7 +110,7 @@ L(more_4x_vec):
> > > > >         vpor    %ymm1, %ymm2, %ymm1
> > > > >         vpmovmskb %ymm1, %eax
> > > > >         testl   %eax, %eax
> > > > > -       jnz     L(first_vec_x2)
> > > > > +       jnz     L(first_vec_x2)
> > > > >
> > > > >         vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > > > >         VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > > @@ -133,122 +118,136 @@ L(more_4x_vec):
> > > > >         vpor    %ymm1, %ymm2, %ymm1
> > > > >         vpmovmskb %ymm1, %eax
> > > > >         testl   %eax, %eax
> > > > > -       jnz     L(first_vec_x3)
> > > > > -
> > > > > -       addq    $(VEC_SIZE * 4), %rdi
> > > > > -
> > > > > -       /* Align data to 4 * VEC_SIZE.  */
> > > > > -       movq    %rdi, %rcx
> > > > > -       andl    $(4 * VEC_SIZE - 1), %ecx
> > > > > -       andq    $-(4 * VEC_SIZE), %rdi
> > > > > -
> > > > > -       .p2align 4
> > > > > -L(loop_4x_vec):
> > > > > -       /* Compare 4 * VEC at a time forward.  */
> > > > > -       vmovdqa (%rdi), %ymm5
> > > > > -       vmovdqa VEC_SIZE(%rdi), %ymm6
> > > > > -       vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> > > > > -       vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > > > > -
> > > > > -       VPCMPEQ %ymm5, %ymm0, %ymm1
> > > > > -       VPCMPEQ %ymm6, %ymm0, %ymm2
> > > > > -       VPCMPEQ %ymm7, %ymm0, %ymm3
> > > > > -       VPCMPEQ %ymm8, %ymm0, %ymm4
> > > > > -
> > > > > -       VPCMPEQ %ymm5, %ymm9, %ymm5
> > > > > -       VPCMPEQ %ymm6, %ymm9, %ymm6
> > > > > -       VPCMPEQ %ymm7, %ymm9, %ymm7
> > > > > -       VPCMPEQ %ymm8, %ymm9, %ymm8
> > > > > -
> > > > > -       vpor    %ymm1, %ymm5, %ymm1
> > > > > -       vpor    %ymm2, %ymm6, %ymm2
> > > > > -       vpor    %ymm3, %ymm7, %ymm3
> > > > > -       vpor    %ymm4, %ymm8, %ymm4
> > > > > -
> > > > > -       vpor    %ymm1, %ymm2, %ymm5
> > > > > -       vpor    %ymm3, %ymm4, %ymm6
> > > > > -
> > > > > -       vpor    %ymm5, %ymm6, %ymm5
> > > > > -
> > > > > -       vpmovmskb %ymm5, %eax
> > > > > -       testl   %eax, %eax
> > > > > -       jnz     L(4x_vec_end)
> > > > > -
> > > > > -       addq    $(VEC_SIZE * 4), %rdi
> > > > > +       jz      L(prep_loop_4x)
> > > > >
> > > > > -       jmp     L(loop_4x_vec)
> > > > > +       tzcntl  %eax, %eax
> > > > > +       leaq    (VEC_SIZE * 3)(%rdi, %rax), %rax
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > +       cmp (%rax), %CHAR_REG
> > > > > +       cmovne  %rdx, %rax
> > > > > +# endif
> > > > > +       VZEROUPPER
> > > > > +       ret
> > > > >
> > > > >         .p2align 4
> > > > >  L(first_vec_x0):
> > > > > -       /* Found CHAR or the null byte.  */
> > > > >         tzcntl  %eax, %eax
> > > > > -# ifdef USE_AS_STRCHRNUL
> > > > > +       /* Found CHAR or the null byte.  */
> > > > >         addq    %rdi, %rax
> > > > > -# else
> > > > > -       xorl    %edx, %edx
> > > > > -       leaq    (%rdi, %rax), %rax
> > > > > -       cmp     (%rax), %CHAR_REG
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > +       cmp (%rax), %CHAR_REG
> > > > >         cmovne  %rdx, %rax
> > > > >  # endif
> > > > >         VZEROUPPER
> > > > >         ret
> > > > > -
> > > > > +
> > > > >         .p2align 4
> > > > >  L(first_vec_x1):
> > > > >         tzcntl  %eax, %eax
> > > > > -# ifdef USE_AS_STRCHRNUL
> > > > > -       addq    $VEC_SIZE, %rax
> > > > > -       addq    %rdi, %rax
> > > > > -# else
> > > > > -       xorl    %edx, %edx
> > > > >         leaq    VEC_SIZE(%rdi, %rax), %rax
> > > > > -       cmp     (%rax), %CHAR_REG
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > +       cmp (%rax), %CHAR_REG
> > > > >         cmovne  %rdx, %rax
> > > > >  # endif
> > > > >         VZEROUPPER
> > > > > -       ret
> > > > > -
> > > > > +       ret
> > > > > +
> > > > >         .p2align 4
> > > > >  L(first_vec_x2):
> > > > >         tzcntl  %eax, %eax
> > > > > -# ifdef USE_AS_STRCHRNUL
> > > > > -       addq    $(VEC_SIZE * 2), %rax
> > > > > -       addq    %rdi, %rax
> > > > > -# else
> > > > > -       xorl    %edx, %edx
> > > > > +       /* Found CHAR or the null byte.  */
> > > > >         leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > > > -       cmp     (%rax), %CHAR_REG
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > +       cmp (%rax), %CHAR_REG
> > > > >         cmovne  %rdx, %rax
> > > > >  # endif
> > > > >         VZEROUPPER
> > > > >         ret
> > > > > +
> > > > > +L(prep_loop_4x):
> > > > > +       /* Align data to 4 * VEC_SIZE.  */
> > > > > +       andq    $-(VEC_SIZE * 4), %rdi
> > > > >
> > > > >         .p2align 4
> > > > > -L(4x_vec_end):
> > > > > +L(loop_4x_vec):
> > > > > +       /* Compare 4 * VEC at a time forward.  */
> > > > > +       vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> > > > > +       vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> > > > > +       vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> > > > > +       vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> > > > > +
> > > > > +       /* Leaves only CHARS matching esi as 0.  */
> > > > > +       vpxor   %ymm5, %ymm0, %ymm1
> > > > > +       vpxor   %ymm6, %ymm0, %ymm2
> > > > > +       vpxor   %ymm7, %ymm0, %ymm3
> > > > > +       vpxor   %ymm8, %ymm0, %ymm4
> > > > > +
> > > > > +       VPMINU  %ymm1, %ymm5, %ymm1
> > > > > +       VPMINU  %ymm2, %ymm6, %ymm2
> > > > > +       VPMINU  %ymm3, %ymm7, %ymm3
> > > > > +       VPMINU  %ymm4, %ymm8, %ymm4
> > > > > +
> > > > > +       VPMINU  %ymm1, %ymm2, %ymm5
> > > > > +       VPMINU  %ymm3, %ymm4, %ymm6
> > > > > +
> > > > > +       VPMINU  %ymm5, %ymm6, %ymm5
> > > > > +
> > > > > +       VPCMPEQ %ymm5, %ymm9, %ymm5
> > > > > +       vpmovmskb %ymm5, %eax
> > > > > +
> > > > > +       addq    $(VEC_SIZE * 4), %rdi
> > > > > +       testl   %eax, %eax
> > > > > +       jz  L(loop_4x_vec)
> > > > > +
> > > > > +       VPCMPEQ %ymm1, %ymm9, %ymm1
> > > > >         vpmovmskb %ymm1, %eax
> > > > >         testl   %eax, %eax
> > > > >         jnz     L(first_vec_x0)
> > > > > +
> > > > > +       VPCMPEQ %ymm2, %ymm9, %ymm2
> > > > >         vpmovmskb %ymm2, %eax
> > > > >         testl   %eax, %eax
> > > > >         jnz     L(first_vec_x1)
> > > > > -       vpmovmskb %ymm3, %eax
> > > > > -       testl   %eax, %eax
> > > > > -       jnz     L(first_vec_x2)
> > > > > +
> > > > > +       VPCMPEQ %ymm3, %ymm9, %ymm3
> > > > > +       VPCMPEQ %ymm4, %ymm9, %ymm4
> > > > > +       vpmovmskb %ymm3, %ecx
> > > > >         vpmovmskb %ymm4, %eax
> > > > > +       salq    $32, %rax
> > > > > +       orq %rcx, %rax
> > > > > +       tzcntq  %rax, %rax
> > > > > +       leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > +       cmp (%rax), %CHAR_REG
> > > > > +       cmovne  %rdx, %rax
> > > > > +# endif
> > > > > +       VZEROUPPER
> > > > > +       ret
> > > > > +
> > > > > +       /* Cold case for crossing page with first load.  */
> > > > > +       .p2align 4
> > > > > +L(cross_page_boundary):
> > > > > +       andq    $-VEC_SIZE, %rdi
> > > > > +       andl    $(VEC_SIZE - 1), %ecx
> > > > > +
> > > > > +       vmovdqa (%rdi), %ymm8
> > > > > +       VPCMPEQ %ymm8, %ymm0, %ymm1
> > > > > +       VPCMPEQ %ymm8, %ymm9, %ymm2
> > > > > +       vpor    %ymm1, %ymm2, %ymm1
> > > > > +       vpmovmskb %ymm1, %eax
> > > > > +       /* Remove the leading bits.      */
> > > > > +       sarxl   %ecx, %eax, %eax
> > > > >         testl   %eax, %eax
> > > > > -L(first_vec_x3):
> > > > > +       jz      L(aligned_more)
> > > > >         tzcntl  %eax, %eax
> > > > > -# ifdef USE_AS_STRCHRNUL
> > > > > -       addq    $(VEC_SIZE * 3), %rax
> > > > > +       addq    %rcx, %rdi
> > > > >         addq    %rdi, %rax
> > > > > -# else
> > > > > -       xorl    %edx, %edx
> > > > > -       leaq    (VEC_SIZE * 3)(%rdi, %rax), %rax
> > > > > -       cmp     (%rax), %CHAR_REG
> > > > > +# ifndef USE_AS_STRCHRNUL
> > > > > +       cmp (%rax), %CHAR_REG
> > > > >         cmovne  %rdx, %rax
> > > > >  # endif
> > > > >         VZEROUPPER
> > > > >         ret
> > > > >
> > > > >  END (STRCHR)
> > > > > -#endif
> > > > > +# endif
> > > > > diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
> > > > > index 583a152794..4dfbe3b58b 100644
> > > > > --- a/sysdeps/x86_64/multiarch/strchr.c
> > > > > +++ b/sysdeps/x86_64/multiarch/strchr.c
> > > > > @@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
> > > > >
> > > > >    if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
> > > > >        && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > > > > +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> > > > >        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> > > > >      return OPTIMIZE (avx2);
> > > > >
> > > > > --
> > > > > 2.29.2
> > > > >
> > > >
> > > > LGTM.
> > > >
> > > > Thanks.
> > > >
> > >
> > > This is the updated patch with extra white spaces fixed I am checking in.
> > >
> > > --
> > > H.J.
> >
> > Awesome! Thanks!
> >
> > N.G.
>
> Shoot, just realized this one has the old commit message that only
> references test-strchr and test-strchrnul as passing (missing
> reference to test-wcschr and test-wcschrnul).
>
> Do you want me to send another patch with proper commit message or can
> you fix it on your end or do is not really matter?
>
> N.G.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
  

Patch

diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index d416558d04..8b9d78b55a 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -27,10 +27,12 @@ 
 # ifdef USE_AS_WCSCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMPEQ	vpcmpeqd
+#  define VPMINU	vpminud
 #  define CHAR_REG	esi
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMPEQ	vpcmpeqb
+#  define VPMINU	vpminub
 #  define CHAR_REG	sil
 # endif
 
@@ -39,20 +41,26 @@ 
 # endif
 
 # define VEC_SIZE 32
+# define PAGE_SIZE 4096
 
 	.section .text.avx,"ax",@progbits
 ENTRY (STRCHR)
 	movl	%edi, %ecx
-	/* Broadcast CHAR to YMM0.  */
+# ifndef USE_AS_STRCHRNUL
+	xorl	%edx, %edx
+# endif
+	
+	/* Broadcast CHAR to YMM0.	*/
 	vmovd	%esi, %xmm0
 	vpxor	%xmm9, %xmm9, %xmm9
 	VPBROADCAST %xmm0, %ymm0
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
-
-	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
+	
+	/* Check if we cross page boundary with one vector load.  */
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja  L(cross_page_boundary)
+	
+	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
 	   null byte.  */
 	vmovdqu	(%rdi), %ymm8
 	VPCMPEQ %ymm8, %ymm0, %ymm1
@@ -60,50 +68,27 @@  ENTRY (STRCHR)
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-
-	jmp	L(more_4x_vec)
-
-	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	vmovdqu	(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	/* Remove the leading bytes.  */
-	sarl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
-	/* Found CHAR or the null byte.  */
+	jz	L(more_vecs)
 	tzcntl	%eax, %eax
-	addq	%rcx, %rax
-# ifdef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
 	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER
 	ret
 
 	.p2align 4
+L(more_vecs):	 
+	/* Align data for aligned loads in the loop.  */
+	andq	$-VEC_SIZE, %rdi
 L(aligned_more):
-	addq	$VEC_SIZE, %rdi
 
-L(more_4x_vec):
-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	vmovdqa	(%rdi), %ymm8
+	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.	*/
+	vmovdqa	VEC_SIZE(%rdi), %ymm8
+	addq	$VEC_SIZE, %rdi
 	VPCMPEQ %ymm8, %ymm0, %ymm1
 	VPCMPEQ %ymm8, %ymm9, %ymm2
 	vpor	%ymm1, %ymm2, %ymm1
@@ -125,7 +110,7 @@  L(more_4x_vec):
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x2)
+	jnz	L(first_vec_x2)	   
 
 	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
 	VPCMPEQ %ymm8, %ymm0, %ymm1
@@ -133,122 +118,136 @@  L(more_4x_vec):
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x3)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
-	andq	$-(4 * VEC_SIZE), %rdi
-
-	.p2align 4
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	vmovdqa	(%rdi), %ymm5
-	vmovdqa	VEC_SIZE(%rdi), %ymm6
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
-
-	VPCMPEQ %ymm5, %ymm0, %ymm1
-	VPCMPEQ %ymm6, %ymm0, %ymm2
-	VPCMPEQ %ymm7, %ymm0, %ymm3
-	VPCMPEQ %ymm8, %ymm0, %ymm4
-
-	VPCMPEQ %ymm5, %ymm9, %ymm5
-	VPCMPEQ %ymm6, %ymm9, %ymm6
-	VPCMPEQ %ymm7, %ymm9, %ymm7
-	VPCMPEQ %ymm8, %ymm9, %ymm8
-
-	vpor	%ymm1, %ymm5, %ymm1
-	vpor	%ymm2, %ymm6, %ymm2
-	vpor	%ymm3, %ymm7, %ymm3
-	vpor	%ymm4, %ymm8, %ymm4
-
-	vpor	%ymm1, %ymm2, %ymm5
-	vpor	%ymm3, %ymm4, %ymm6
-
-	vpor	%ymm5, %ymm6, %ymm5
-
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jnz	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
+	jz	L(prep_loop_4x)
 
-	jmp	L(loop_4x_vec)
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	VZEROUPPER
+	ret
 
 	.p2align 4
 L(first_vec_x0):
-	/* Found CHAR or the null byte.  */
 	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
 	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER
 	ret
-
+	
 	.p2align 4
 L(first_vec_x1):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
-	addq	$VEC_SIZE, %rax
-	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
 	leaq	VEC_SIZE(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER
-	ret
-
+	ret	   
+	
 	.p2align 4
 L(first_vec_x2):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
-	addq	$(VEC_SIZE * 2), %rax
-	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
+	/* Found CHAR or the null byte.	 */
 	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER
 	ret
+	
+L(prep_loop_4x):
+	/* Align data to 4 * VEC_SIZE.	*/
+	andq	$-(VEC_SIZE * 4), %rdi
 
 	.p2align 4
-L(4x_vec_end):
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
+	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
+	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
+
+	/* Leaves only CHARS matching esi as 0.	 */
+	vpxor	%ymm5, %ymm0, %ymm1
+	vpxor	%ymm6, %ymm0, %ymm2
+	vpxor	%ymm7, %ymm0, %ymm3
+	vpxor	%ymm8, %ymm0, %ymm4
+
+	VPMINU	%ymm1, %ymm5, %ymm1
+	VPMINU	%ymm2, %ymm6, %ymm2
+	VPMINU	%ymm3, %ymm7, %ymm3
+	VPMINU	%ymm4, %ymm8, %ymm4
+
+	VPMINU	%ymm1, %ymm2, %ymm5
+	VPMINU	%ymm3, %ymm4, %ymm6
+
+	VPMINU	%ymm5, %ymm6, %ymm5
+
+	VPCMPEQ %ymm5, %ymm9, %ymm5
+	vpmovmskb %ymm5, %eax
+
+	addq	$(VEC_SIZE * 4), %rdi
+	testl	%eax, %eax
+	jz  L(loop_4x_vec)
+	
+	VPCMPEQ %ymm1, %ymm9, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x0)
+
+	VPCMPEQ %ymm2, %ymm9, %ymm2
 	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
+
+	VPCMPEQ %ymm3, %ymm9, %ymm3
+	VPCMPEQ %ymm4, %ymm9, %ymm4
+	vpmovmskb %ymm3, %ecx
 	vpmovmskb %ymm4, %eax
+	salq	$32, %rax
+	orq %rcx, %rax
+	tzcntq  %rax, %rax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	VZEROUPPER
+	ret
+
+	/* Cold case for crossing page with first load.	 */
+	.p2align 4
+L(cross_page_boundary):
+	andq	$-VEC_SIZE, %rdi
+	andl	$(VEC_SIZE - 1), %ecx
+
+	vmovdqa	(%rdi), %ymm8
+	VPCMPEQ %ymm8, %ymm0, %ymm1
+	VPCMPEQ %ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Remove the leading bits.	 */
+	sarxl	%ecx, %eax, %eax
 	testl	%eax, %eax
-L(first_vec_x3):
+	jz	L(aligned_more)	   
 	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
-	addq	$(VEC_SIZE * 3), %rax
+	addq	%rcx, %rdi
 	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER
 	ret
 
 END (STRCHR)
-#endif
+# endif
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
index 583a152794..4dfbe3b58b 100644
--- a/sysdeps/x86_64/multiarch/strchr.c
+++ b/sysdeps/x86_64/multiarch/strchr.c
@@ -37,6 +37,7 @@  IFUNC_SELECTOR (void)
 
   if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
       && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     return OPTIMIZE (avx2);