[v6,7/8] x86: Shrink code size of memchr-avx2.S

Message ID 20220607041134.2369903-7-goldstein.w.n@gmail.com
State Committed
Commit 6dcbb7d95dded20153b12d76d2f4e0ef0cda4f35
Series [v6,1/8] x86: Create header for VEC classes in x86 strings library

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Noah Goldstein June 7, 2022, 4:11 a.m. UTC
This is not meant as a performance optimization. The previous code was
far too liberal in aligning targets and wasted code size unnecessarily.

The total code size saving is: 59 bytes

There are no major changes in the benchmarks.
Geometric Mean of all benchmarks New / Old: 0.967

Full xcheck passes on x86_64.
---
 sysdeps/x86_64/multiarch/memchr-avx2-rtm.S |   1 +
 sysdeps/x86_64/multiarch/memchr-avx2.S     | 109 +++++++++++----------
 2 files changed, 60 insertions(+), 50 deletions(-)
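
As an aside on where the bytes come from (a minimal illustration, not part of
the patch): much of the saving is from relaxing alignment directives. GAS's
.p2align accepts an optional third argument that bounds how much padding it
may emit, so a label is only aligned when doing so is cheap:

	.p2align 5	/* Always align to 32 bytes: up to 31 padding bytes.  */
	.p2align 4	/* Always align to 16 bytes: up to 15 padding bytes.  */
	.p2align 4,, 10	/* Align to 16 bytes only if it takes at most 10
			   padding bytes; otherwise emit no padding.  */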
  

Comments

H.J. Lu June 7, 2022, 6:18 p.m. UTC | #1
On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This is not meant as a performance optimization. The previous code was
> far too liberal in aligning targets and wasted code size unnecessarily.
>
> The total code size saving is: 59 bytes
>
> There are no major changes in the benchmarks.
> Geometric Mean of all benchmarks New / Old: 0.967
>
> Full xcheck passes on x86_64.
> ---
>  sysdeps/x86_64/multiarch/memchr-avx2-rtm.S |   1 +
>  sysdeps/x86_64/multiarch/memchr-avx2.S     | 109 +++++++++++----------
>  2 files changed, 60 insertions(+), 50 deletions(-)
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
  
Sunil Pandey July 14, 2022, 2:31 a.m. UTC | #2
On Tue, Jun 7, 2022 at 11:19 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
  
Noah Goldstein July 14, 2022, 2:41 a.m. UTC | #3
On Wed, Jul 13, 2022 at 7:32 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> I would like to backport this patch to release branches.
> Any comments or objections?

Probably best to squash with:
https://sourceware.org/git/?p=glibc.git;a=commit;h=2c9af8421d2b4a7fcce163e7bc81a118d22fd346
>
> --Sunil
  

Patch

diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
index 87b076c7c4..c4d71938c5 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
@@ -2,6 +2,7 @@ 
 # define MEMCHR __memchr_avx2_rtm
 #endif
 
+#define COND_VZEROUPPER	COND_VZEROUPPER_XTEST
 #define ZERO_UPPER_VEC_REGISTERS_RETURN \
   ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
 
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 75bd7262e0..28a01280ec 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -57,7 +57,7 @@ 
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
 	.section SECTION(.text),"ax",@progbits
-ENTRY (MEMCHR)
+ENTRY_P2ALIGN (MEMCHR, 5)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
 #  ifdef __ILP32__
@@ -87,12 +87,14 @@  ENTRY (MEMCHR)
 # endif
 	testl	%eax, %eax
 	jz	L(aligned_more)
-	tzcntl	%eax, %eax
+	bsfl	%eax, %eax
 	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
 
 # ifndef USE_AS_RAWMEMCHR
-	.p2align 5
+	.p2align 4
 L(first_vec_x0):
 	/* Check if first match was before length.  */
 	tzcntl	%eax, %eax
@@ -100,58 +102,31 @@  L(first_vec_x0):
 	/* NB: Multiply length by 4 to get byte count.  */
 	sall	$2, %edx
 #  endif
-	xorl	%ecx, %ecx
+    COND_VZEROUPPER
+	/* Use branch instead of cmovcc so L(first_vec_x0) fits in one fetch
+	   block. branch here as opposed to cmovcc is not that costly. Common
+	   usage of memchr is to check if the return was NULL (if string was
+	   known to contain CHAR user would use rawmemchr). This branch will be
+	   highly correlated with the user branch and can be used by most
+	   modern branch predictors to predict the user branch.  */
 	cmpl	%eax, %edx
-	leaq	(%rdi, %rax), %rax
-	cmovle	%rcx, %rax
-	VZEROUPPER_RETURN
-
-L(null):
-	xorl	%eax, %eax
-	ret
-# endif
-	.p2align 4
-L(cross_page_boundary):
-	/* Save pointer before aligning as its original value is
-	   necessary for computer return address if byte is found or
-	   adjusting length if it is not and this is memchr.  */
-	movq	%rdi, %rcx
-	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
-	   and rdi for rawmemchr.  */
-	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
-	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-# ifndef USE_AS_RAWMEMCHR
-	/* Calculate length until end of page (length checked for a
-	   match).  */
-	leaq	1(%ALGN_PTR_REG), %rsi
-	subq	%RRAW_PTR_REG, %rsi
-#  ifdef USE_AS_WMEMCHR
-	/* NB: Divide bytes by 4 to get wchar_t count.  */
-	shrl	$2, %esi
-#  endif
-# endif
-	/* Remove the leading bytes.  */
-	sarxl	%ERAW_PTR_REG, %eax, %eax
-# ifndef USE_AS_RAWMEMCHR
-	/* Check the end of data.  */
-	cmpq	%rsi, %rdx
-	jbe	L(first_vec_x0)
+    jle  L(null)
+	addq	%rdi, %rax
+    ret
 # endif
-	testl	%eax, %eax
-	jz	L(cross_page_continue)
-	tzcntl	%eax, %eax
-	addq	%RRAW_PTR_REG, %rax
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
 
-	.p2align 4
+	.p2align 4,, 10
 L(first_vec_x1):
-	tzcntl	%eax, %eax
+	bsfl	%eax, %eax
 	incq	%rdi
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
-
+# ifndef USE_AS_RAWMEMCHR
+	/* Fits in the aligning bytes here.  */
+L(null):
+	xorl	%eax, %eax
+	ret
+# endif
 	.p2align 4
 L(first_vec_x2):
 	tzcntl	%eax, %eax
@@ -340,7 +315,7 @@  L(first_vec_x1_check):
 	incq	%rdi
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
-	.p2align 4
+	.p2align 4,, 6
 L(set_zero_end):
 	xorl	%eax, %eax
 	VZEROUPPER_RETURN
@@ -428,5 +403,39 @@  L(last_vec_x3):
 	VZEROUPPER_RETURN
 # endif
 
+	.p2align 4
+L(cross_page_boundary):
+	/* Save pointer before aligning as its original value is necessary for
+	   computing the return address if byte is found or adjusting length if
+	   it is not and this is memchr.  */
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi for
+	   rawmemchr.  */
+	andq	$-VEC_SIZE, %ALGN_PTR_REG
+	VPCMPEQ	(%ALGN_PTR_REG), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Calculate length until end of page (length checked for a match).  */
+	leal	VEC_SIZE(%ALGN_PTR_REG), %esi
+	subl	%ERAW_PTR_REG, %esi
+# ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %esi
+# endif
+# endif
+	/* Remove the leading bytes.  */
+	sarxl	%ERAW_PTR_REG, %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Check the end of data.  */
+	cmpq	%rsi, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(cross_page_continue)
+	bsfl	%eax, %eax
+	addq	%RRAW_PTR_REG, %rax
+	VZEROUPPER_RETURN
+
+
 END (MEMCHR)
 #endif
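
A note on the cmovcc-to-branch change in L(first_vec_x0): the comment relies
on the typical call site already branching on the result. A rough sketch of
that caller pattern (illustrative only, not taken from glibc):

	call	memchr		/* %rax = pointer to match, or NULL.  */
	testq	%rax, %rax
	je	.Lnot_found	/* The "user branch" the comment refers to.  The new
				   jle inside L(first_vec_x0) tends to be predicted
				   together with it.  */

The tzcnt-to-bsf swaps are likewise size-only: bsf drops the F3 prefix, saving
one byte per use, and is interchangeable here because each mask has already
been tested non-zero before the bit scan.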