[v1] x86: Fix page cross case in rawmemchr-avx2 [BZ #29234]

Message ID 20220608195836.3335491-1-goldstein.w.n@gmail.com
State Committed
Headers
Series [v1] x86: Fix page cross case in rawmemchr-avx2 [BZ #29234] |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Noah Goldstein June 8, 2022, 7:58 p.m. UTC
  commit 6dcbb7d95dded20153b12d76d2f4e0ef0cda4f35
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Jun 6 21:11:33 2022 -0700

    x86: Shrink code size of memchr-avx2.S

Changed how the page cross case aligned string (rdi) in
rawmemchr. This was incompatible with how
`L(cross_page_continue)` expected the pointer to be aligned and
would cause rawmemchr to read data start started before the
beginning of the string. What it would read was in valid memory
but could count CHAR matches resulting in an incorrect return
value.

This commit fixes that issue by essentially reverting the changes to
the L(page_cross) case as they didn't really matter.

Test cases added and all pass with the new code (and where confirmed
to fail with the old code).
---
 string/test-rawmemchr.c                | 62 +++++++++++++++++++++++++-
 sysdeps/x86_64/multiarch/memchr-avx2.S | 16 +++----
 2 files changed, 69 insertions(+), 9 deletions(-)
  

Comments

Noah Goldstein June 8, 2022, 7:59 p.m. UTC | #1
On Wed, Jun 8, 2022 at 12:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> commit 6dcbb7d95dded20153b12d76d2f4e0ef0cda4f35
> Author: Noah Goldstein <goldstein.w.n@gmail.com>
> Date:   Mon Jun 6 21:11:33 2022 -0700
>
>     x86: Shrink code size of memchr-avx2.S
>
> Changed how the page cross case aligned string (rdi) in
> rawmemchr. This was incompatible with how
> `L(cross_page_continue)` expected the pointer to be aligned and
> would cause rawmemchr to read data start started before the
> beginning of the string. What it would read was in valid memory
> but could count CHAR matches resulting in an incorrect return
> value.
>
> This commit fixes that issue by essentially reverting the changes to
> the L(page_cross) case as they didn't really matter.
>
> Test cases added and all pass with the new code (and where confirmed
> to fail with the old code).
> ---
>  string/test-rawmemchr.c                | 62 +++++++++++++++++++++++++-
>  sysdeps/x86_64/multiarch/memchr-avx2.S | 16 +++----
>  2 files changed, 69 insertions(+), 9 deletions(-)
>
> diff --git a/string/test-rawmemchr.c b/string/test-rawmemchr.c
> index cafb75298a..ef8316d300 100644
> --- a/string/test-rawmemchr.c
> +++ b/string/test-rawmemchr.c
> @@ -50,13 +50,51 @@ do_one_test (impl_t *impl, const char *s, int c, char *exp_res)
>      }
>  }
>
> +static void
> +do_test_bz29234 (void)
> +{
> +  size_t i, j;
> +  char *ptr_start;
> +  char *buf = mmap (0, 8192, PROT_READ | PROT_WRITE,
> +                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> +  if (buf == MAP_FAILED)
> +    {
> +      error (0, 0, "Failed to allocate memory for bz2934 test");
> +      ret = 1;
> +      return;
> +    }
> +
> +  memset (buf, -1, 8192);
> +
> +  ptr_start = buf + 4096 - 8;
> +
> +  /* Out of range matches before the start of a page. */
> +  memset (ptr_start - 8, 0x1, 8);
> +
> +  for (j = 0; j < 8; ++j)
> +    {
> +      for (i = 0; i < 128; ++i)
> +       {
> +         ptr_start[i + j] = 0x1;
> +
> +         FOR_EACH_IMPL (impl, 0)
> +           do_one_test (impl, (char *) (ptr_start + j), 0x1,
> +                        ptr_start + i + j);
> +
> +         ptr_start[i + j] = 0xff;
> +       }
> +    }
> +
> +  munmap (buf, 8192);
> +}
> +
>  static void
>  do_test (size_t align, size_t pos, size_t len, int seek_char)
>  {
>    size_t i;
>    char *result;
>
> -  align &= 7;
> +  align &= getpagesize () - 1;
>    if (align + len >= page_size)
>      return;
>
> @@ -114,6 +152,13 @@ do_random_tests (void)
>             }
>         }
>
> +      if (align)
> +       {
> +         p[align - 1] = seek_char;
> +         if (align > 4)
> +           p[align - 4] = seek_char;
> +       }
> +
>        assert (pos < len);
>        size_t r = random ();
>        if ((r & 31) == 0)
> @@ -129,6 +174,13 @@ do_random_tests (void)
>                    result, p);
>             ret = 1;
>           }
> +
> +      if (align)
> +       {
> +         p[align - 1] = seek_char;
> +         if (align > 4)
> +           p[align - 4] = seek_char;
> +       }
>      }
>  }
>
> @@ -150,14 +202,22 @@ test_main (void)
>        do_test (i, 64, 256, 23);
>        do_test (0, 16 << i, 2048, 0);
>        do_test (i, 64, 256, 0);
> +
> +      do_test (getpagesize () - i, 64, 256, 23);
> +      do_test (getpagesize () - i, 64, 256, 0);
>      }
>    for (i = 1; i < 32; ++i)
>      {
>        do_test (0, i, i + 1, 23);
>        do_test (0, i, i + 1, 0);
> +
> +      do_test (getpagesize () - 7, i, i + 1, 23);
> +      do_test (getpagesize () - i / 2, i, i + 1, 23);
> +      do_test (getpagesize () - i, i, i + 1, 23);
>      }
>
>    do_random_tests ();
> +  do_test_bz29234 ();
>    return ret;
>  }
>
> diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
> index 28a01280ec..c5a256eb37 100644
> --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
> @@ -409,19 +409,19 @@ L(cross_page_boundary):
>            computer return address if byte is found or adjusting length if it
>            is not and this is memchr.  */
>         movq    %rdi, %rcx
> -       /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi for
> -          rawmemchr.  */
> -       andq    $-VEC_SIZE, %ALGN_PTR_REG
> -       VPCMPEQ (%ALGN_PTR_REG), %ymm0, %ymm1
> +       /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
> +          and rdi for rawmemchr.  */
> +       orq     $(VEC_SIZE - 1), %ALGN_PTR_REG
> +       VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
>         vpmovmskb %ymm1, %eax
>  # ifndef USE_AS_RAWMEMCHR
>         /* Calculate length until end of page (length checked for a match).  */
> -       leal    VEC_SIZE(%ALGN_PTR_REG), %esi
> -       subl    %ERAW_PTR_REG, %esi
> -# ifdef USE_AS_WMEMCHR
> +       leaq    1(%ALGN_PTR_REG), %rsi
> +       subq    %RRAW_PTR_REG, %rsi
> +#  ifdef USE_AS_WMEMCHR
>         /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrl    $2, %esi
> -# endif
> +#  endif
>  # endif
>         /* Remove the leading bytes.  */
>         sarxl   %ERAW_PTR_REG, %eax, %eax
> --
> 2.34.1
>

Manual do you want to test this?
  
H.J. Lu June 8, 2022, 9:06 p.m. UTC | #2
On Wed, Jun 8, 2022 at 12:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> commit 6dcbb7d95dded20153b12d76d2f4e0ef0cda4f35
> Author: Noah Goldstein <goldstein.w.n@gmail.com>
> Date:   Mon Jun 6 21:11:33 2022 -0700
>
>     x86: Shrink code size of memchr-avx2.S
>
> Changed how the page cross case aligned string (rdi) in
> rawmemchr. This was incompatible with how
> `L(cross_page_continue)` expected the pointer to be aligned and
> would cause rawmemchr to read data start started before the
> beginning of the string. What it would read was in valid memory
> but could count CHAR matches resulting in an incorrect return
> value.
>
> This commit fixes that issue by essentially reverting the changes to
> the L(page_cross) case as they didn't really matter.
>
> Test cases added and all pass with the new code (and where confirmed
> to fail with the old code).
> ---
>  string/test-rawmemchr.c                | 62 +++++++++++++++++++++++++-
>  sysdeps/x86_64/multiarch/memchr-avx2.S | 16 +++----
>  2 files changed, 69 insertions(+), 9 deletions(-)
>
> diff --git a/string/test-rawmemchr.c b/string/test-rawmemchr.c
> index cafb75298a..ef8316d300 100644
> --- a/string/test-rawmemchr.c
> +++ b/string/test-rawmemchr.c
> @@ -50,13 +50,51 @@ do_one_test (impl_t *impl, const char *s, int c, char *exp_res)
>      }
>  }
>
> +static void
> +do_test_bz29234 (void)
> +{
> +  size_t i, j;
> +  char *ptr_start;
> +  char *buf = mmap (0, 8192, PROT_READ | PROT_WRITE,
> +                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

Please use xmmap.

> +  if (buf == MAP_FAILED)
> +    {
> +      error (0, 0, "Failed to allocate memory for bz2934 test");
> +      ret = 1;
> +      return;
> +    }
> +
> +  memset (buf, -1, 8192);
> +
> +  ptr_start = buf + 4096 - 8;
> +
> +  /* Out of range matches before the start of a page. */
> +  memset (ptr_start - 8, 0x1, 8);
> +
> +  for (j = 0; j < 8; ++j)
> +    {
> +      for (i = 0; i < 128; ++i)
> +       {
> +         ptr_start[i + j] = 0x1;
> +
> +         FOR_EACH_IMPL (impl, 0)
> +           do_one_test (impl, (char *) (ptr_start + j), 0x1,
> +                        ptr_start + i + j);
> +
> +         ptr_start[i + j] = 0xff;
> +       }
> +    }
> +
> +  munmap (buf, 8192);
> +}
> +
>  static void
>  do_test (size_t align, size_t pos, size_t len, int seek_char)
>  {
>    size_t i;
>    char *result;
>
> -  align &= 7;
> +  align &= getpagesize () - 1;
>    if (align + len >= page_size)
>      return;
>
> @@ -114,6 +152,13 @@ do_random_tests (void)
>             }
>         }
>
> +      if (align)
> +       {
> +         p[align - 1] = seek_char;
> +         if (align > 4)
> +           p[align - 4] = seek_char;
> +       }
> +
>        assert (pos < len);
>        size_t r = random ();
>        if ((r & 31) == 0)
> @@ -129,6 +174,13 @@ do_random_tests (void)
>                    result, p);
>             ret = 1;
>           }
> +
> +      if (align)
> +       {
> +         p[align - 1] = seek_char;
> +         if (align > 4)
> +           p[align - 4] = seek_char;
> +       }
>      }
>  }
>
> @@ -150,14 +202,22 @@ test_main (void)
>        do_test (i, 64, 256, 23);
>        do_test (0, 16 << i, 2048, 0);
>        do_test (i, 64, 256, 0);
> +
> +      do_test (getpagesize () - i, 64, 256, 23);
> +      do_test (getpagesize () - i, 64, 256, 0);
>      }
>    for (i = 1; i < 32; ++i)
>      {
>        do_test (0, i, i + 1, 23);
>        do_test (0, i, i + 1, 0);
> +
> +      do_test (getpagesize () - 7, i, i + 1, 23);
> +      do_test (getpagesize () - i / 2, i, i + 1, 23);
> +      do_test (getpagesize () - i, i, i + 1, 23);
>      }
>
>    do_random_tests ();
> +  do_test_bz29234 ();
>    return ret;
>  }
>
> diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
> index 28a01280ec..c5a256eb37 100644
> --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
> @@ -409,19 +409,19 @@ L(cross_page_boundary):
>            computer return address if byte is found or adjusting length if it
>            is not and this is memchr.  */
>         movq    %rdi, %rcx
> -       /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi for
> -          rawmemchr.  */
> -       andq    $-VEC_SIZE, %ALGN_PTR_REG
> -       VPCMPEQ (%ALGN_PTR_REG), %ymm0, %ymm1
> +       /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
> +          and rdi for rawmemchr.  */
> +       orq     $(VEC_SIZE - 1), %ALGN_PTR_REG
> +       VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
>         vpmovmskb %ymm1, %eax
>  # ifndef USE_AS_RAWMEMCHR
>         /* Calculate length until end of page (length checked for a match).  */
> -       leal    VEC_SIZE(%ALGN_PTR_REG), %esi
> -       subl    %ERAW_PTR_REG, %esi
> -# ifdef USE_AS_WMEMCHR
> +       leaq    1(%ALGN_PTR_REG), %rsi
> +       subq    %RRAW_PTR_REG, %rsi
> +#  ifdef USE_AS_WMEMCHR
>         /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrl    $2, %esi
> -# endif
> +#  endif
>  # endif
>         /* Remove the leading bytes.  */
>         sarxl   %ERAW_PTR_REG, %eax, %eax
> --
> 2.34.1
>
  
Noah Goldstein June 8, 2022, 9:35 p.m. UTC | #3
On Wed, Jun 8, 2022 at 2:07 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Jun 8, 2022 at 12:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > commit 6dcbb7d95dded20153b12d76d2f4e0ef0cda4f35
> > Author: Noah Goldstein <goldstein.w.n@gmail.com>
> > Date:   Mon Jun 6 21:11:33 2022 -0700
> >
> >     x86: Shrink code size of memchr-avx2.S
> >
> > Changed how the page cross case aligned string (rdi) in
> > rawmemchr. This was incompatible with how
> > `L(cross_page_continue)` expected the pointer to be aligned and
> > would cause rawmemchr to read data start started before the
> > beginning of the string. What it would read was in valid memory
> > but could count CHAR matches resulting in an incorrect return
> > value.
> >
> > This commit fixes that issue by essentially reverting the changes to
> > the L(page_cross) case as they didn't really matter.
> >
> > Test cases added and all pass with the new code (and where confirmed
> > to fail with the old code).
> > ---
> >  string/test-rawmemchr.c                | 62 +++++++++++++++++++++++++-
> >  sysdeps/x86_64/multiarch/memchr-avx2.S | 16 +++----
> >  2 files changed, 69 insertions(+), 9 deletions(-)
> >
> > diff --git a/string/test-rawmemchr.c b/string/test-rawmemchr.c
> > index cafb75298a..ef8316d300 100644
> > --- a/string/test-rawmemchr.c
> > +++ b/string/test-rawmemchr.c
> > @@ -50,13 +50,51 @@ do_one_test (impl_t *impl, const char *s, int c, char *exp_res)
> >      }
> >  }
> >
> > +static void
> > +do_test_bz29234 (void)
> > +{
> > +  size_t i, j;
> > +  char *ptr_start;
> > +  char *buf = mmap (0, 8192, PROT_READ | PROT_WRITE,
> > +                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
>
> Please use xmmap.

Fixed in V2.
>
> > +  if (buf == MAP_FAILED)
> > +    {
> > +      error (0, 0, "Failed to allocate memory for bz2934 test");
> > +      ret = 1;
> > +      return;
> > +    }
> > +
> > +  memset (buf, -1, 8192);
> > +
> > +  ptr_start = buf + 4096 - 8;
> > +
> > +  /* Out of range matches before the start of a page. */
> > +  memset (ptr_start - 8, 0x1, 8);
> > +
> > +  for (j = 0; j < 8; ++j)
> > +    {
> > +      for (i = 0; i < 128; ++i)
> > +       {
> > +         ptr_start[i + j] = 0x1;
> > +
> > +         FOR_EACH_IMPL (impl, 0)
> > +           do_one_test (impl, (char *) (ptr_start + j), 0x1,
> > +                        ptr_start + i + j);
> > +
> > +         ptr_start[i + j] = 0xff;
> > +       }
> > +    }
> > +
> > +  munmap (buf, 8192);
> > +}
> > +
> >  static void
> >  do_test (size_t align, size_t pos, size_t len, int seek_char)
> >  {
> >    size_t i;
> >    char *result;
> >
> > -  align &= 7;
> > +  align &= getpagesize () - 1;
> >    if (align + len >= page_size)
> >      return;
> >
> > @@ -114,6 +152,13 @@ do_random_tests (void)
> >             }
> >         }
> >
> > +      if (align)
> > +       {
> > +         p[align - 1] = seek_char;
> > +         if (align > 4)
> > +           p[align - 4] = seek_char;
> > +       }
> > +
> >        assert (pos < len);
> >        size_t r = random ();
> >        if ((r & 31) == 0)
> > @@ -129,6 +174,13 @@ do_random_tests (void)
> >                    result, p);
> >             ret = 1;
> >           }
> > +
> > +      if (align)
> > +       {
> > +         p[align - 1] = seek_char;
> > +         if (align > 4)
> > +           p[align - 4] = seek_char;
> > +       }
> >      }
> >  }
> >
> > @@ -150,14 +202,22 @@ test_main (void)
> >        do_test (i, 64, 256, 23);
> >        do_test (0, 16 << i, 2048, 0);
> >        do_test (i, 64, 256, 0);
> > +
> > +      do_test (getpagesize () - i, 64, 256, 23);
> > +      do_test (getpagesize () - i, 64, 256, 0);
> >      }
> >    for (i = 1; i < 32; ++i)
> >      {
> >        do_test (0, i, i + 1, 23);
> >        do_test (0, i, i + 1, 0);
> > +
> > +      do_test (getpagesize () - 7, i, i + 1, 23);
> > +      do_test (getpagesize () - i / 2, i, i + 1, 23);
> > +      do_test (getpagesize () - i, i, i + 1, 23);
> >      }
> >
> >    do_random_tests ();
> > +  do_test_bz29234 ();
> >    return ret;
> >  }
> >
> > diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
> > index 28a01280ec..c5a256eb37 100644
> > --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
> > @@ -409,19 +409,19 @@ L(cross_page_boundary):
> >            computer return address if byte is found or adjusting length if it
> >            is not and this is memchr.  */
> >         movq    %rdi, %rcx
> > -       /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi for
> > -          rawmemchr.  */
> > -       andq    $-VEC_SIZE, %ALGN_PTR_REG
> > -       VPCMPEQ (%ALGN_PTR_REG), %ymm0, %ymm1
> > +       /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
> > +          and rdi for rawmemchr.  */
> > +       orq     $(VEC_SIZE - 1), %ALGN_PTR_REG
> > +       VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
> >         vpmovmskb %ymm1, %eax
> >  # ifndef USE_AS_RAWMEMCHR
> >         /* Calculate length until end of page (length checked for a match).  */
> > -       leal    VEC_SIZE(%ALGN_PTR_REG), %esi
> > -       subl    %ERAW_PTR_REG, %esi
> > -# ifdef USE_AS_WMEMCHR
> > +       leaq    1(%ALGN_PTR_REG), %rsi
> > +       subq    %RRAW_PTR_REG, %rsi
> > +#  ifdef USE_AS_WMEMCHR
> >         /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrl    $2, %esi
> > -# endif
> > +#  endif
> >  # endif
> >         /* Remove the leading bytes.  */
> >         sarxl   %ERAW_PTR_REG, %eax, %eax
> > --
> > 2.34.1
> >
>
>
> --
> H.J.
  

Patch

diff --git a/string/test-rawmemchr.c b/string/test-rawmemchr.c
index cafb75298a..ef8316d300 100644
--- a/string/test-rawmemchr.c
+++ b/string/test-rawmemchr.c
@@ -50,13 +50,51 @@  do_one_test (impl_t *impl, const char *s, int c, char *exp_res)
     }
 }
 
+static void
+do_test_bz29234 (void)
+{
+  size_t i, j;
+  char *ptr_start;
+  char *buf = mmap (0, 8192, PROT_READ | PROT_WRITE,
+		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (buf == MAP_FAILED)
+    {
+      error (0, 0, "Failed to allocate memory for bz2934 test");
+      ret = 1;
+      return;
+    }
+
+  memset (buf, -1, 8192);
+
+  ptr_start = buf + 4096 - 8;
+
+  /* Out of range matches before the start of a page. */
+  memset (ptr_start - 8, 0x1, 8);
+
+  for (j = 0; j < 8; ++j)
+    {
+      for (i = 0; i < 128; ++i)
+	{
+	  ptr_start[i + j] = 0x1;
+
+	  FOR_EACH_IMPL (impl, 0)
+	    do_one_test (impl, (char *) (ptr_start + j), 0x1,
+			 ptr_start + i + j);
+
+	  ptr_start[i + j] = 0xff;
+	}
+    }
+
+  munmap (buf, 8192);
+}
+
 static void
 do_test (size_t align, size_t pos, size_t len, int seek_char)
 {
   size_t i;
   char *result;
 
-  align &= 7;
+  align &= getpagesize () - 1;
   if (align + len >= page_size)
     return;
 
@@ -114,6 +152,13 @@  do_random_tests (void)
 	    }
 	}
 
+      if (align)
+	{
+	  p[align - 1] = seek_char;
+	  if (align > 4)
+	    p[align - 4] = seek_char;
+	}
+
       assert (pos < len);
       size_t r = random ();
       if ((r & 31) == 0)
@@ -129,6 +174,13 @@  do_random_tests (void)
 		   result, p);
 	    ret = 1;
 	  }
+
+      if (align)
+	{
+	  p[align - 1] = seek_char;
+	  if (align > 4)
+	    p[align - 4] = seek_char;
+	}
     }
 }
 
@@ -150,14 +202,22 @@  test_main (void)
       do_test (i, 64, 256, 23);
       do_test (0, 16 << i, 2048, 0);
       do_test (i, 64, 256, 0);
+
+      do_test (getpagesize () - i, 64, 256, 23);
+      do_test (getpagesize () - i, 64, 256, 0);
     }
   for (i = 1; i < 32; ++i)
     {
       do_test (0, i, i + 1, 23);
       do_test (0, i, i + 1, 0);
+
+      do_test (getpagesize () - 7, i, i + 1, 23);
+      do_test (getpagesize () - i / 2, i, i + 1, 23);
+      do_test (getpagesize () - i, i, i + 1, 23);
     }
 
   do_random_tests ();
+  do_test_bz29234 ();
   return ret;
 }
 
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 28a01280ec..c5a256eb37 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -409,19 +409,19 @@  L(cross_page_boundary):
 	   computer return address if byte is found or adjusting length if it
 	   is not and this is memchr.  */
 	movq	%rdi, %rcx
-	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi for
-	   rawmemchr.  */
-	andq	$-VEC_SIZE, %ALGN_PTR_REG
-	VPCMPEQ	(%ALGN_PTR_REG), %ymm0, %ymm1
+	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+	   and rdi for rawmemchr.  */
+	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
+	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* Calculate length until end of page (length checked for a match).  */
-	leal	VEC_SIZE(%ALGN_PTR_REG), %esi
-	subl	%ERAW_PTR_REG, %esi
-# ifdef USE_AS_WMEMCHR
+	leaq	1(%ALGN_PTR_REG), %rsi
+	subq	%RRAW_PTR_REG, %rsi
+#  ifdef USE_AS_WMEMCHR
 	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %esi
-# endif
+#  endif
 # endif
 	/* Remove the leading bytes.  */
 	sarxl	%ERAW_PTR_REG, %eax, %eax