diff mbox series

[v2,3/3] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]

Message ID 20210622181111.185897-3-goldstein.w.n@gmail.com
State Committed
Headers show
Series None | expand

Commit Message

Noah Goldstein June 22, 2021, 6:11 p.m. UTC
This commit fixes the bug mentioned in the previous commit.

The previous implementations of wmemchr in these files relied
on maxlen * sizeof(wchar_t) which was not guranteed by the standard.

The new overflow tests added in the previous commit now
pass (As well as all the other tests).

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
 sysdeps/x86_64/strlen.S                |  14 ++-
 2 files changed, 106 insertions(+), 38 deletions(-)

Comments

H.J. Lu June 22, 2021, 9:33 p.m. UTC | #1
On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit fixes the bug mentioned in the previous commit.
>
> The previous implementations of wmemchr in these files relied
> on maxlen * sizeof(wchar_t) which was not guranteed by the standard.
>
> The new overflow tests added in the previous commit now
> pass (As well as all the other tests).
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
>  sysdeps/x86_64/strlen.S                |  14 ++-
>  2 files changed, 106 insertions(+), 38 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
> index bd2e6ee44a..b282a75613 100644
> --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> @@ -44,21 +44,21 @@
>
>  # define VEC_SIZE 32
>  # define PAGE_SIZE 4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>
>         .section SECTION(.text),"ax",@progbits
>  ENTRY (STRLEN)
>  # ifdef USE_AS_STRNLEN
>         /* Check zero length.  */
> +#  ifdef __ILP32__
> +       /* Clear upper bits.  */
> +       and     %RSI_LP, %RSI_LP
> +#  else
>         test    %RSI_LP, %RSI_LP
> +#  endif
>         jz      L(zero)
>         /* Store max len in R8_LP before adjusting if using WCSLEN.  */
>         mov     %RSI_LP, %R8_LP
> -#  ifdef USE_AS_WCSLEN
> -       shl     $2, %RSI_LP
> -#  elif defined __ILP32__
> -       /* Clear the upper 32 bits.  */
> -       movl    %esi, %esi
> -#  endif
>  # endif
>         movl    %edi, %eax
>         movq    %rdi, %rdx
> @@ -72,10 +72,10 @@ ENTRY (STRLEN)
>
>         /* Check the first VEC_SIZE bytes.  */
>         VPCMPEQ (%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>  # ifdef USE_AS_STRNLEN
>         /* If length < VEC_SIZE handle special.  */
> -       cmpq    $VEC_SIZE, %rsi
> +       cmpq    $CHAR_PER_VEC, %rsi
>         jbe     L(first_vec_x0)
>  # endif
>         /* If empty continue to aligned_more. Otherwise return bit
> @@ -84,6 +84,7 @@ ENTRY (STRLEN)
>         jz      L(aligned_more)
>         tzcntl  %eax, %eax
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  # endif
>         VZEROUPPER_RETURN
> @@ -97,9 +98,14 @@ L(zero):
>  L(first_vec_x0):
>         /* Set bit for max len so that tzcnt will return min of max len
>            and position of first match.  */
> +#  ifdef USE_AS_WCSLEN
> +       /* NB: Multiply length by 4 to get byte count.  */
> +       sall    $2, %esi
> +#  endif
>         btsq    %rsi, %rax
>         tzcntl  %eax, %eax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -113,14 +119,19 @@ L(first_vec_x1):
>  # ifdef USE_AS_STRNLEN
>         /* Use ecx which was computed earlier to compute correct value.
>          */
> +#  ifdef USE_AS_WCSLEN
> +       leal    -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
> +#  else
>         subl    $(VEC_SIZE * 4 + 1), %ecx
>         addl    %ecx, %eax
> +#  endif
>  # else
>         subl    %edx, %edi
>         incl    %edi
>         addl    %edi, %eax
>  # endif
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  # endif
>         VZEROUPPER_RETURN
> @@ -133,14 +144,19 @@ L(first_vec_x2):
>  # ifdef USE_AS_STRNLEN
>         /* Use ecx which was computed earlier to compute correct value.
>          */
> +#  ifdef USE_AS_WCSLEN
> +       leal    -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
> +#  else
>         subl    $(VEC_SIZE * 3 + 1), %ecx
>         addl    %ecx, %eax
> +#  endif
>  # else
>         subl    %edx, %edi
>         addl    $(VEC_SIZE + 1), %edi
>         addl    %edi, %eax
>  # endif
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  # endif
>         VZEROUPPER_RETURN
> @@ -153,14 +169,19 @@ L(first_vec_x3):
>  # ifdef USE_AS_STRNLEN
>         /* Use ecx which was computed earlier to compute correct value.
>          */
> +#  ifdef USE_AS_WCSLEN
> +       leal    -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
> +#  else
>         subl    $(VEC_SIZE * 2 + 1), %ecx
>         addl    %ecx, %eax
> +#  endif
>  # else
>         subl    %edx, %edi
>         addl    $(VEC_SIZE * 2 + 1), %edi
>         addl    %edi, %eax
>  # endif
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  # endif
>         VZEROUPPER_RETURN
> @@ -173,14 +194,19 @@ L(first_vec_x4):
>  # ifdef USE_AS_STRNLEN
>         /* Use ecx which was computed earlier to compute correct value.
>          */
> +#  ifdef USE_AS_WCSLEN
> +       leal    -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
> +#  else
>         subl    $(VEC_SIZE + 1), %ecx
>         addl    %ecx, %eax
> +#  endif
>  # else
>         subl    %edx, %edi
>         addl    $(VEC_SIZE * 3 + 1), %edi
>         addl    %edi, %eax
>  # endif
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  # endif
>         VZEROUPPER_RETURN
> @@ -195,10 +221,14 @@ L(cross_page_continue):
>         /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
>            since data is only aligned to VEC_SIZE.  */
>  # ifdef USE_AS_STRNLEN
> -       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
> -          it simplies the logic in last_4x_vec_or_less.  */
> +       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
> +          because it simplies the logic in last_4x_vec_or_less.  */
>         leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
>         subq    %rdx, %rcx
> +#  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> +       sarl    $2, %ecx
> +#  endif
>  # endif
>         /* Load first VEC regardless.  */
>         VPCMPEQ 1(%rdi), %ymm0, %ymm1
> @@ -207,34 +237,38 @@ L(cross_page_continue):
>         subq    %rcx, %rsi
>         jb      L(last_4x_vec_or_less)
>  # endif
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x1)
>
>         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x2)
>
>         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x3)
>
>         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x4)
>
>         /* Align data to VEC_SIZE * 4 - 1.  */
>  # ifdef USE_AS_STRNLEN
>         /* Before adjusting length check if at last VEC_SIZE * 4.  */
> -       cmpq    $(VEC_SIZE * 4 - 1), %rsi
> +       cmpq    $(CHAR_PER_VEC * 4 - 1), %rsi
>         jbe     L(last_4x_vec_or_less_load)
>         incq    %rdi
>         movl    %edi, %ecx
>         orq     $(VEC_SIZE * 4 - 1), %rdi
>         andl    $(VEC_SIZE * 4 - 1), %ecx
> +#  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> +       sarl    $2, %ecx
> +#  endif
>         /* Readjust length.  */
>         addq    %rcx, %rsi
>  # else
> @@ -246,13 +280,13 @@ L(cross_page_continue):
>  L(loop_4x_vec):
>  # ifdef USE_AS_STRNLEN
>         /* Break if at end of length.  */
> -       subq    $(VEC_SIZE * 4), %rsi
> +       subq    $(CHAR_PER_VEC * 4), %rsi
>         jb      L(last_4x_vec_or_less_cmpeq)
>  # endif
> -       /* Save some code size by microfusing VPMINU with the load. Since
> -          the matches in ymm2/ymm4 can only be returned if there where no
> -          matches in ymm1/ymm3 respectively there is no issue with overlap.
> -        */
> +       /* Save some code size by microfusing VPMINU with the load.
> +          Since the matches in ymm2/ymm4 can only be returned if there
> +          where no matches in ymm1/ymm3 respectively there is no issue
> +          with overlap.  */
>         vmovdqa 1(%rdi), %ymm1
>         VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
>         vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> @@ -260,7 +294,7 @@ L(loop_4x_vec):
>
>         VPMINU  %ymm2, %ymm4, %ymm5
>         VPCMPEQ %ymm5, %ymm0, %ymm5
> -       vpmovmskb       %ymm5, %ecx
> +       vpmovmskb %ymm5, %ecx
>
>         subq    $-(VEC_SIZE * 4), %rdi
>         testl   %ecx, %ecx
> @@ -268,27 +302,28 @@ L(loop_4x_vec):
>
>
>         VPCMPEQ %ymm1, %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         subq    %rdx, %rdi
>         testl   %eax, %eax
>         jnz     L(last_vec_return_x0)
>
>         VPCMPEQ %ymm2, %ymm0, %ymm2
> -       vpmovmskb       %ymm2, %eax
> +       vpmovmskb %ymm2, %eax
>         testl   %eax, %eax
>         jnz     L(last_vec_return_x1)
>
>         /* Combine last 2 VEC.  */
>         VPCMPEQ %ymm3, %ymm0, %ymm3
> -       vpmovmskb       %ymm3, %eax
> -       /* rcx has combined result from all 4 VEC. It will only be used if
> -          the first 3 other VEC all did not contain a match.  */
> +       vpmovmskb %ymm3, %eax
> +       /* rcx has combined result from all 4 VEC. It will only be used
> +          if the first 3 other VEC all did not contain a match.  */
>         salq    $32, %rcx
>         orq     %rcx, %rax
>         tzcntq  %rax, %rax
>         subq    $(VEC_SIZE * 2 - 1), %rdi
>         addq    %rdi, %rax
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  # endif
>         VZEROUPPER_RETURN
> @@ -297,15 +332,19 @@ L(loop_4x_vec):
>  # ifdef USE_AS_STRNLEN
>         .p2align 4
>  L(last_4x_vec_or_less_load):
> -       /* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
> +       /* Depending on entry adjust rdi / prepare first VEC in ymm1.
> +        */
>         subq    $-(VEC_SIZE * 4), %rdi
>  L(last_4x_vec_or_less_cmpeq):
>         VPCMPEQ 1(%rdi), %ymm0, %ymm1
>  L(last_4x_vec_or_less):
> -
> -       vpmovmskb       %ymm1, %eax
> -       /* If remaining length > VEC_SIZE * 2. This works if esi is off by
> -          VEC_SIZE * 4.  */
> +#  ifdef USE_AS_WCSLEN
> +       /* NB: Multiply length by 4 to get byte count.  */
> +       sall    $2, %esi
> +#  endif
> +       vpmovmskb %ymm1, %eax
> +       /* If remaining length > VEC_SIZE * 2. This works if esi is off
> +          by VEC_SIZE * 4.  */
>         testl   $(VEC_SIZE * 2), %esi
>         jnz     L(last_4x_vec)
>
> @@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
>         jb      L(max)
>
>         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         tzcntl  %eax, %eax
>         /* Check the end of data.  */
>         cmpl    %eax, %esi
> @@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
>         addl    $(VEC_SIZE + 1), %eax
>         addq    %rdi, %rax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -340,6 +380,7 @@ L(last_vec_return_x0):
>         subq    $(VEC_SIZE * 4 - 1), %rdi
>         addq    %rdi, %rax
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  # endif
>         VZEROUPPER_RETURN
> @@ -350,6 +391,7 @@ L(last_vec_return_x1):
>         subq    $(VEC_SIZE * 3 - 1), %rdi
>         addq    %rdi, %rax
>  # ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  # endif
>         VZEROUPPER_RETURN
> @@ -366,6 +408,7 @@ L(last_vec_x1_check):
>         incl    %eax
>         addq    %rdi, %rax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -381,14 +424,14 @@ L(last_4x_vec):
>         jnz     L(last_vec_x1)
>
>         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(last_vec_x2)
>
>         /* Normalize length.  */
>         andl    $(VEC_SIZE * 4 - 1), %esi
>         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(last_vec_x3)
>
> @@ -396,7 +439,7 @@ L(last_4x_vec):
>         jb      L(max)
>
>         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         tzcntl  %eax, %eax
>         /* Check the end of data.  */
>         cmpl    %eax, %esi
> @@ -405,6 +448,7 @@ L(last_4x_vec):
>         addl    $(VEC_SIZE * 3 + 1), %eax
>         addq    %rdi, %rax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -419,6 +463,7 @@ L(last_vec_x1):
>         incl    %eax
>         addq    %rdi, %rax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -432,6 +477,7 @@ L(last_vec_x2):
>         addl    $(VEC_SIZE + 1), %eax
>         addq    %rdi, %rax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -447,6 +493,7 @@ L(last_vec_x3):
>         addl    $(VEC_SIZE * 2 + 1), %eax
>         addq    %rdi, %rax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
> @@ -455,13 +502,13 @@ L(max_end):
>         VZEROUPPER_RETURN
>  # endif
>
> -       /* Cold case for crossing page with first load.  */
> +       /* Cold case for crossing page with first load.  */
>         .p2align 4
>  L(cross_page_boundary):
>         /* Align data to VEC_SIZE - 1.  */
>         orq     $(VEC_SIZE - 1), %rdi
>         VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> -       vpmovmskb       %ymm1, %eax
> +       vpmovmskb %ymm1, %eax
>         /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
>            so no need to manually mod rdx.  */
>         sarxl   %edx, %eax, %eax
> @@ -470,6 +517,10 @@ L(cross_page_boundary):
>         jnz     L(cross_page_less_vec)
>         leaq    1(%rdi), %rcx
>         subq    %rdx, %rcx
> +#  ifdef USE_AS_WCSLEN
> +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> +       shrl    $2, %ecx
> +#  endif
>         /* Check length.  */
>         cmpq    %rsi, %rcx
>         jb      L(cross_page_continue)
> @@ -479,6 +530,7 @@ L(cross_page_boundary):
>         jz      L(cross_page_continue)
>         tzcntl  %eax, %eax
>  #  ifdef USE_AS_WCSLEN
> +       /* NB: Divide length by 4 to get wchar_t count.  */
>         shrl    $2, %eax
>  #  endif
>  # endif
> @@ -489,6 +541,10 @@ L(return_vzeroupper):
>         .p2align 4
>  L(cross_page_less_vec):
>         tzcntl  %eax, %eax
> +#  ifdef USE_AS_WCSLEN
> +       /* NB: Multiply length by 4 to get byte count.  */
> +       sall    $2, %esi
> +#  endif
>         cmpq    %rax, %rsi
>         cmovb   %esi, %eax
>  #  ifdef USE_AS_WCSLEN
> diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
> index d223ea1700..3fc6734910 100644
> --- a/sysdeps/x86_64/strlen.S
> +++ b/sysdeps/x86_64/strlen.S
> @@ -65,12 +65,24 @@ ENTRY(strlen)
>         ret
>  L(n_nonzero):
>  # ifdef AS_WCSLEN
> -       shl     $2, %RSI_LP
> +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
> +   overflow the only way this program doesn't have undefined behavior
> +   is if there is a null terminator in valid memory so strlen will
> +   suffice.  */
> +       mov     %RSI_LP, %R10_LP
> +       sar     $62, %R10_LP
> +       test    %R10_LP, %R10_LP
> +       jnz     __wcslen_sse2

Branch to  __wcslen_sse2 is wrong for 2 reasons:

1.  __wcslen_sse2 is undefined with --disable-multi-arch.
2. You should skip ENDBR64 at function entry.

Please create a new label and branch to it.

> +       sal     $2, %RSI_LP
>  # endif
>
>  /* Initialize long lived registers.  */
>
>         add     %RDI_LP, %RSI_LP
> +# ifdef AS_WCSLEN
> +/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
> +       jbe     __wcslen_sse2
> +# endif
>         mov     %RSI_LP, %R10_LP
>         and     $-64, %R10_LP
>         mov     %RSI_LP, %R11_LP
> --
> 2.25.1
>

Thanks.


--
H.J.
Noah Goldstein June 22, 2021, 11:16 p.m. UTC | #2
On Tue, Jun 22, 2021 at 5:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >
> > This commit fixes the bug mentioned in the previous commit.
> >
> > The previous implementations of wmemchr in these files relied
> > on maxlen * sizeof(wchar_t) which was not guranteed by the standard.
> >
> > The new overflow tests added in the previous commit now
> > pass (As well as all the other tests).
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> >  sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
> >  sysdeps/x86_64/strlen.S                |  14 ++-
> >  2 files changed, 106 insertions(+), 38 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S
> b/sysdeps/x86_64/multiarch/strlen-avx2.S
> > index bd2e6ee44a..b282a75613 100644
> > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> > @@ -44,21 +44,21 @@
> >
> >  # define VEC_SIZE 32
> >  # define PAGE_SIZE 4096
> > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> >
> >         .section SECTION(.text),"ax",@progbits
> >  ENTRY (STRLEN)
> >  # ifdef USE_AS_STRNLEN
> >         /* Check zero length.  */
> > +#  ifdef __ILP32__
> > +       /* Clear upper bits.  */
> > +       and     %RSI_LP, %RSI_LP
> > +#  else
> >         test    %RSI_LP, %RSI_LP
> > +#  endif
> >         jz      L(zero)
> >         /* Store max len in R8_LP before adjusting if using WCSLEN.  */
> >         mov     %RSI_LP, %R8_LP
> > -#  ifdef USE_AS_WCSLEN
> > -       shl     $2, %RSI_LP
> > -#  elif defined __ILP32__
> > -       /* Clear the upper 32 bits.  */
> > -       movl    %esi, %esi
> > -#  endif
> >  # endif
> >         movl    %edi, %eax
> >         movq    %rdi, %rdx
> > @@ -72,10 +72,10 @@ ENTRY (STRLEN)
> >
> >         /* Check the first VEC_SIZE bytes.  */
> >         VPCMPEQ (%rdi), %ymm0, %ymm1
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >  # ifdef USE_AS_STRNLEN
> >         /* If length < VEC_SIZE handle special.  */
> > -       cmpq    $VEC_SIZE, %rsi
> > +       cmpq    $CHAR_PER_VEC, %rsi
> >         jbe     L(first_vec_x0)
> >  # endif
> >         /* If empty continue to aligned_more. Otherwise return bit
> > @@ -84,6 +84,7 @@ ENTRY (STRLEN)
> >         jz      L(aligned_more)
> >         tzcntl  %eax, %eax
> >  # ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrl    $2, %eax
> >  # endif
> >         VZEROUPPER_RETURN
> > @@ -97,9 +98,14 @@ L(zero):
> >  L(first_vec_x0):
> >         /* Set bit for max len so that tzcnt will return min of max len
> >            and position of first match.  */
> > +#  ifdef USE_AS_WCSLEN
> > +       /* NB: Multiply length by 4 to get byte count.  */
> > +       sall    $2, %esi
> > +#  endif
> >         btsq    %rsi, %rax
> >         tzcntl  %eax, %eax
> >  #  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrl    $2, %eax
> >  #  endif
> >         VZEROUPPER_RETURN
> > @@ -113,14 +119,19 @@ L(first_vec_x1):
> >  # ifdef USE_AS_STRNLEN
> >         /* Use ecx which was computed earlier to compute correct value.
> >          */
> > +#  ifdef USE_AS_WCSLEN
> > +       leal    -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
> > +#  else
> >         subl    $(VEC_SIZE * 4 + 1), %ecx
> >         addl    %ecx, %eax
> > +#  endif
> >  # else
> >         subl    %edx, %edi
> >         incl    %edi
> >         addl    %edi, %eax
> >  # endif
> >  # ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrl    $2, %eax
> >  # endif
> >         VZEROUPPER_RETURN
> > @@ -133,14 +144,19 @@ L(first_vec_x2):
> >  # ifdef USE_AS_STRNLEN
> >         /* Use ecx which was computed earlier to compute correct value.
> >          */
> > +#  ifdef USE_AS_WCSLEN
> > +       leal    -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
> > +#  else
> >         subl    $(VEC_SIZE * 3 + 1), %ecx
> >         addl    %ecx, %eax
> > +#  endif
> >  # else
> >         subl    %edx, %edi
> >         addl    $(VEC_SIZE + 1), %edi
> >         addl    %edi, %eax
> >  # endif
> >  # ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrl    $2, %eax
> >  # endif
> >         VZEROUPPER_RETURN
> > @@ -153,14 +169,19 @@ L(first_vec_x3):
> >  # ifdef USE_AS_STRNLEN
> >         /* Use ecx which was computed earlier to compute correct value.
> >          */
> > +#  ifdef USE_AS_WCSLEN
> > +       leal    -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
> > +#  else
> >         subl    $(VEC_SIZE * 2 + 1), %ecx
> >         addl    %ecx, %eax
> > +#  endif
> >  # else
> >         subl    %edx, %edi
> >         addl    $(VEC_SIZE * 2 + 1), %edi
> >         addl    %edi, %eax
> >  # endif
> >  # ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrl    $2, %eax
> >  # endif
> >         VZEROUPPER_RETURN
> > @@ -173,14 +194,19 @@ L(first_vec_x4):
> >  # ifdef USE_AS_STRNLEN
> >         /* Use ecx which was computed earlier to compute correct value.
> >          */
> > +#  ifdef USE_AS_WCSLEN
> > +       leal    -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
> > +#  else
> >         subl    $(VEC_SIZE + 1), %ecx
> >         addl    %ecx, %eax
> > +#  endif
> >  # else
> >         subl    %edx, %edi
> >         addl    $(VEC_SIZE * 3 + 1), %edi
> >         addl    %edi, %eax
> >  # endif
> >  # ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrl    $2, %eax
> >  # endif
> >         VZEROUPPER_RETURN
> > @@ -195,10 +221,14 @@ L(cross_page_continue):
> >         /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> >            since data is only aligned to VEC_SIZE.  */
> >  # ifdef USE_AS_STRNLEN
> > -       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
> because
> > -          it simplies the logic in last_4x_vec_or_less.  */
> > +       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
> > +          because it simplies the logic in last_4x_vec_or_less.  */
> >         leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
> >         subq    %rdx, %rcx
> > +#  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> > +       sarl    $2, %ecx
> > +#  endif
> >  # endif
> >         /* Load first VEC regardless.  */
> >         VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > @@ -207,34 +237,38 @@ L(cross_page_continue):
> >         subq    %rcx, %rsi
> >         jb      L(last_4x_vec_or_less)
> >  # endif
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x1)
> >
> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x2)
> >
> >         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x3)
> >
> >         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x4)
> >
> >         /* Align data to VEC_SIZE * 4 - 1.  */
> >  # ifdef USE_AS_STRNLEN
> >         /* Before adjusting length check if at last VEC_SIZE * 4.  */
> > -       cmpq    $(VEC_SIZE * 4 - 1), %rsi
> > +       cmpq    $(CHAR_PER_VEC * 4 - 1), %rsi
> >         jbe     L(last_4x_vec_or_less_load)
> >         incq    %rdi
> >         movl    %edi, %ecx
> >         orq     $(VEC_SIZE * 4 - 1), %rdi
> >         andl    $(VEC_SIZE * 4 - 1), %ecx
> > +#  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> > +       sarl    $2, %ecx
> > +#  endif
> >         /* Readjust length.  */
> >         addq    %rcx, %rsi
> >  # else
> > @@ -246,13 +280,13 @@ L(cross_page_continue):
> >  L(loop_4x_vec):
> >  # ifdef USE_AS_STRNLEN
> >         /* Break if at end of length.  */
> > -       subq    $(VEC_SIZE * 4), %rsi
> > +       subq    $(CHAR_PER_VEC * 4), %rsi
> >         jb      L(last_4x_vec_or_less_cmpeq)
> >  # endif
> > -       /* Save some code size by microfusing VPMINU with the load. Since
> > -          the matches in ymm2/ymm4 can only be returned if there where
> no
> > -          matches in ymm1/ymm3 respectively there is no issue with
> overlap.
> > -        */
> > +       /* Save some code size by microfusing VPMINU with the load.
> > +          Since the matches in ymm2/ymm4 can only be returned if there
> > +          where no matches in ymm1/ymm3 respectively there is no issue
> > +          with overlap.  */
> >         vmovdqa 1(%rdi), %ymm1
> >         VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
> >         vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> > @@ -260,7 +294,7 @@ L(loop_4x_vec):
> >
> >         VPMINU  %ymm2, %ymm4, %ymm5
> >         VPCMPEQ %ymm5, %ymm0, %ymm5
> > -       vpmovmskb       %ymm5, %ecx
> > +       vpmovmskb %ymm5, %ecx
> >
> >         subq    $-(VEC_SIZE * 4), %rdi
> >         testl   %ecx, %ecx
> > @@ -268,27 +302,28 @@ L(loop_4x_vec):
> >
> >
> >         VPCMPEQ %ymm1, %ymm0, %ymm1
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >         subq    %rdx, %rdi
> >         testl   %eax, %eax
> >         jnz     L(last_vec_return_x0)
> >
> >         VPCMPEQ %ymm2, %ymm0, %ymm2
> > -       vpmovmskb       %ymm2, %eax
> > +       vpmovmskb %ymm2, %eax
> >         testl   %eax, %eax
> >         jnz     L(last_vec_return_x1)
> >
> >         /* Combine last 2 VEC.  */
> >         VPCMPEQ %ymm3, %ymm0, %ymm3
> > -       vpmovmskb       %ymm3, %eax
> > -       /* rcx has combined result from all 4 VEC. It will only be used
> if
> > -          the first 3 other VEC all did not contain a match.  */
> > +       vpmovmskb %ymm3, %eax
> > +       /* rcx has combined result from all 4 VEC. It will only be used
> > +          if the first 3 other VEC all did not contain a match.  */
> >         salq    $32, %rcx
> >         orq     %rcx, %rax
> >         tzcntq  %rax, %rax
> >         subq    $(VEC_SIZE * 2 - 1), %rdi
> >         addq    %rdi, %rax
> >  # ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrq    $2, %rax
> >  # endif
> >         VZEROUPPER_RETURN
> > @@ -297,15 +332,19 @@ L(loop_4x_vec):
> >  # ifdef USE_AS_STRNLEN
> >         .p2align 4
> >  L(last_4x_vec_or_less_load):
> > -       /* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
> > +       /* Depending on entry adjust rdi / prepare first VEC in ymm1.
> > +        */
> >         subq    $-(VEC_SIZE * 4), %rdi
> >  L(last_4x_vec_or_less_cmpeq):
> >         VPCMPEQ 1(%rdi), %ymm0, %ymm1
> >  L(last_4x_vec_or_less):
> > -
> > -       vpmovmskb       %ymm1, %eax
> > -       /* If remaining length > VEC_SIZE * 2. This works if esi is off
> by
> > -          VEC_SIZE * 4.  */
> > +#  ifdef USE_AS_WCSLEN
> > +       /* NB: Multiply length by 4 to get byte count.  */
> > +       sall    $2, %esi
> > +#  endif
> > +       vpmovmskb %ymm1, %eax
> > +       /* If remaining length > VEC_SIZE * 2. This works if esi is off
> > +          by VEC_SIZE * 4.  */
> >         testl   $(VEC_SIZE * 2), %esi
> >         jnz     L(last_4x_vec)
> >
> > @@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
> >         jb      L(max)
> >
> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >         tzcntl  %eax, %eax
> >         /* Check the end of data.  */
> >         cmpl    %eax, %esi
> > @@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
> >         addl    $(VEC_SIZE + 1), %eax
> >         addq    %rdi, %rax
> >  #  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrq    $2, %rax
> >  #  endif
> >         VZEROUPPER_RETURN
> > @@ -340,6 +380,7 @@ L(last_vec_return_x0):
> >         subq    $(VEC_SIZE * 4 - 1), %rdi
> >         addq    %rdi, %rax
> >  # ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrq    $2, %rax
> >  # endif
> >         VZEROUPPER_RETURN
> > @@ -350,6 +391,7 @@ L(last_vec_return_x1):
> >         subq    $(VEC_SIZE * 3 - 1), %rdi
> >         addq    %rdi, %rax
> >  # ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrq    $2, %rax
> >  # endif
> >         VZEROUPPER_RETURN
> > @@ -366,6 +408,7 @@ L(last_vec_x1_check):
> >         incl    %eax
> >         addq    %rdi, %rax
> >  #  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrq    $2, %rax
> >  #  endif
> >         VZEROUPPER_RETURN
> > @@ -381,14 +424,14 @@ L(last_4x_vec):
> >         jnz     L(last_vec_x1)
> >
> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >         testl   %eax, %eax
> >         jnz     L(last_vec_x2)
> >
> >         /* Normalize length.  */
> >         andl    $(VEC_SIZE * 4 - 1), %esi
> >         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >         testl   %eax, %eax
> >         jnz     L(last_vec_x3)
> >
> > @@ -396,7 +439,7 @@ L(last_4x_vec):
> >         jb      L(max)
> >
> >         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >         tzcntl  %eax, %eax
> >         /* Check the end of data.  */
> >         cmpl    %eax, %esi
> > @@ -405,6 +448,7 @@ L(last_4x_vec):
> >         addl    $(VEC_SIZE * 3 + 1), %eax
> >         addq    %rdi, %rax
> >  #  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrq    $2, %rax
> >  #  endif
> >         VZEROUPPER_RETURN
> > @@ -419,6 +463,7 @@ L(last_vec_x1):
> >         incl    %eax
> >         addq    %rdi, %rax
> >  #  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrq    $2, %rax
> >  #  endif
> >         VZEROUPPER_RETURN
> > @@ -432,6 +477,7 @@ L(last_vec_x2):
> >         addl    $(VEC_SIZE + 1), %eax
> >         addq    %rdi, %rax
> >  #  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrq    $2, %rax
> >  #  endif
> >         VZEROUPPER_RETURN
> > @@ -447,6 +493,7 @@ L(last_vec_x3):
> >         addl    $(VEC_SIZE * 2 + 1), %eax
> >         addq    %rdi, %rax
> >  #  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >         shrq    $2, %rax
> >  #  endif
> >         VZEROUPPER_RETURN
> > @@ -455,13 +502,13 @@ L(max_end):
> >         VZEROUPPER_RETURN
> >  # endif
> >
> > -       /* Cold case for crossing page with first load.  */
> > +       /* Cold case for crossing page with first load.  */
> >         .p2align 4
> >  L(cross_page_boundary):
> >         /* Align data to VEC_SIZE - 1.  */
> >         orq     $(VEC_SIZE - 1), %rdi
> >         VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> > -       vpmovmskb       %ymm1, %eax
> > +       vpmovmskb %ymm1, %eax
> >         /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> >            so no need to manually mod rdx.  */
> >         sarxl   %edx, %eax, %eax
> > @@ -470,6 +517,10 @@ L(cross_page_boundary):
> >         jnz     L(cross_page_less_vec)
> >         leaq    1(%rdi), %rcx
> >         subq    %rdx, %rcx
> > +#  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> > +       shrl    $2, %ecx
> > +#  endif
> >         /* Check length.  */
> >         cmpq    %rsi, %rcx
> >         jb      L(cross_page_continue)
> > @@ -479,6 +530,7 @@ L(cross_page_boundary):
> >         jz      L(cross_page_continue)
> >         tzcntl  %eax, %eax
> >  #  ifdef USE_AS_WCSLEN
> > +       /* NB: Divide length by 4 to get wchar_t count.  */
> >         shrl    $2, %eax
> >  #  endif
> >  # endif
> > @@ -489,6 +541,10 @@ L(return_vzeroupper):
> >         .p2align 4
> >  L(cross_page_less_vec):
> >         tzcntl  %eax, %eax
> > +#  ifdef USE_AS_WCSLEN
> > +       /* NB: Multiply length by 4 to get byte count.  */
> > +       sall    $2, %esi
> > +#  endif
> >         cmpq    %rax, %rsi
> >         cmovb   %esi, %eax
> >  #  ifdef USE_AS_WCSLEN
> > diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
> > index d223ea1700..3fc6734910 100644
> > --- a/sysdeps/x86_64/strlen.S
> > +++ b/sysdeps/x86_64/strlen.S
> > @@ -65,12 +65,24 @@ ENTRY(strlen)
> >         ret
> >  L(n_nonzero):
> >  # ifdef AS_WCSLEN
> > -       shl     $2, %RSI_LP
> > +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
> > +   overflow the only way this program doesn't have undefined behavior
> > +   is if there is a null terminator in valid memory so strlen will
> > +   suffice.  */
> > +       mov     %RSI_LP, %R10_LP
> > +       sar     $62, %R10_LP
> > +       test    %R10_LP, %R10_LP
> > +       jnz     __wcslen_sse2
>
> Branch to  __wcslen_sse2 is wrong for 2 reasons:
>
> 1.  __wcslen_sse2 is undefined with --disable-multi-arch.
>
Won't __wcsnlen_sse2 be undefined with --disable-multi-arch as well?


> 2. You should skip ENDBR64 at function entry.
>
> Please create a new label and branch to it.
>
> I am not quite sure how to do this. I am trying to use
strstr-sse2-unaligned.S as a template:
https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S;h=21e1a5f7cfde8ec07fcc4fc80d26984a58d651d7;hb=HEAD#l78
which appears to make a direct call to the global label of __strchr_sse2
without anything special in strchr-sse2.S or strstr-sse2-unaligned.S.

Is there an example in the code you know of I can follow?


> > +       sal     $2, %RSI_LP
> >  # endif
> >
> >  /* Initialize long lived registers.  */
> >
> >         add     %RDI_LP, %RSI_LP
> > +# ifdef AS_WCSLEN
> > +/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
> > +       jbe     __wcslen_sse2
> > +# endif
> >         mov     %RSI_LP, %R10_LP
> >         and     $-64, %R10_LP
> >         mov     %RSI_LP, %R11_LP
> > --
> > 2.25.1
> >
>
> Thanks.
>
>
> --
> H.J.
>
H.J. Lu June 22, 2021, 11:28 p.m. UTC | #3
On Tue, Jun 22, 2021 at 4:16 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
>
>
> On Tue, Jun 22, 2021 at 5:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> >
>> > This commit fixes the bug mentioned in the previous commit.
>> >
>> > The previous implementations of wmemchr in these files relied
>> > on maxlen * sizeof(wchar_t) which was not guranteed by the standard.
>> >
>> > The new overflow tests added in the previous commit now
>> > pass (As well as all the other tests).
>> >
>> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
>> > ---
>> >  sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
>> >  sysdeps/x86_64/strlen.S                |  14 ++-
>> >  2 files changed, 106 insertions(+), 38 deletions(-)
>> >
>> > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
>> > index bd2e6ee44a..b282a75613 100644
>> > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
>> > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
>> > @@ -44,21 +44,21 @@
>> >
>> >  # define VEC_SIZE 32
>> >  # define PAGE_SIZE 4096
>> > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>> >
>> >         .section SECTION(.text),"ax",@progbits
>> >  ENTRY (STRLEN)
>> >  # ifdef USE_AS_STRNLEN
>> >         /* Check zero length.  */
>> > +#  ifdef __ILP32__
>> > +       /* Clear upper bits.  */
>> > +       and     %RSI_LP, %RSI_LP
>> > +#  else
>> >         test    %RSI_LP, %RSI_LP
>> > +#  endif
>> >         jz      L(zero)
>> >         /* Store max len in R8_LP before adjusting if using WCSLEN.  */
>> >         mov     %RSI_LP, %R8_LP
>> > -#  ifdef USE_AS_WCSLEN
>> > -       shl     $2, %RSI_LP
>> > -#  elif defined __ILP32__
>> > -       /* Clear the upper 32 bits.  */
>> > -       movl    %esi, %esi
>> > -#  endif
>> >  # endif
>> >         movl    %edi, %eax
>> >         movq    %rdi, %rdx
>> > @@ -72,10 +72,10 @@ ENTRY (STRLEN)
>> >
>> >         /* Check the first VEC_SIZE bytes.  */
>> >         VPCMPEQ (%rdi), %ymm0, %ymm1
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >  # ifdef USE_AS_STRNLEN
>> >         /* If length < VEC_SIZE handle special.  */
>> > -       cmpq    $VEC_SIZE, %rsi
>> > +       cmpq    $CHAR_PER_VEC, %rsi
>> >         jbe     L(first_vec_x0)
>> >  # endif
>> >         /* If empty continue to aligned_more. Otherwise return bit
>> > @@ -84,6 +84,7 @@ ENTRY (STRLEN)
>> >         jz      L(aligned_more)
>> >         tzcntl  %eax, %eax
>> >  # ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrl    $2, %eax
>> >  # endif
>> >         VZEROUPPER_RETURN
>> > @@ -97,9 +98,14 @@ L(zero):
>> >  L(first_vec_x0):
>> >         /* Set bit for max len so that tzcnt will return min of max len
>> >            and position of first match.  */
>> > +#  ifdef USE_AS_WCSLEN
>> > +       /* NB: Multiply length by 4 to get byte count.  */
>> > +       sall    $2, %esi
>> > +#  endif
>> >         btsq    %rsi, %rax
>> >         tzcntl  %eax, %eax
>> >  #  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrl    $2, %eax
>> >  #  endif
>> >         VZEROUPPER_RETURN
>> > @@ -113,14 +119,19 @@ L(first_vec_x1):
>> >  # ifdef USE_AS_STRNLEN
>> >         /* Use ecx which was computed earlier to compute correct value.
>> >          */
>> > +#  ifdef USE_AS_WCSLEN
>> > +       leal    -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
>> > +#  else
>> >         subl    $(VEC_SIZE * 4 + 1), %ecx
>> >         addl    %ecx, %eax
>> > +#  endif
>> >  # else
>> >         subl    %edx, %edi
>> >         incl    %edi
>> >         addl    %edi, %eax
>> >  # endif
>> >  # ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrl    $2, %eax
>> >  # endif
>> >         VZEROUPPER_RETURN
>> > @@ -133,14 +144,19 @@ L(first_vec_x2):
>> >  # ifdef USE_AS_STRNLEN
>> >         /* Use ecx which was computed earlier to compute correct value.
>> >          */
>> > +#  ifdef USE_AS_WCSLEN
>> > +       leal    -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
>> > +#  else
>> >         subl    $(VEC_SIZE * 3 + 1), %ecx
>> >         addl    %ecx, %eax
>> > +#  endif
>> >  # else
>> >         subl    %edx, %edi
>> >         addl    $(VEC_SIZE + 1), %edi
>> >         addl    %edi, %eax
>> >  # endif
>> >  # ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrl    $2, %eax
>> >  # endif
>> >         VZEROUPPER_RETURN
>> > @@ -153,14 +169,19 @@ L(first_vec_x3):
>> >  # ifdef USE_AS_STRNLEN
>> >         /* Use ecx which was computed earlier to compute correct value.
>> >          */
>> > +#  ifdef USE_AS_WCSLEN
>> > +       leal    -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
>> > +#  else
>> >         subl    $(VEC_SIZE * 2 + 1), %ecx
>> >         addl    %ecx, %eax
>> > +#  endif
>> >  # else
>> >         subl    %edx, %edi
>> >         addl    $(VEC_SIZE * 2 + 1), %edi
>> >         addl    %edi, %eax
>> >  # endif
>> >  # ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrl    $2, %eax
>> >  # endif
>> >         VZEROUPPER_RETURN
>> > @@ -173,14 +194,19 @@ L(first_vec_x4):
>> >  # ifdef USE_AS_STRNLEN
>> >         /* Use ecx which was computed earlier to compute correct value.
>> >          */
>> > +#  ifdef USE_AS_WCSLEN
>> > +       leal    -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
>> > +#  else
>> >         subl    $(VEC_SIZE + 1), %ecx
>> >         addl    %ecx, %eax
>> > +#  endif
>> >  # else
>> >         subl    %edx, %edi
>> >         addl    $(VEC_SIZE * 3 + 1), %edi
>> >         addl    %edi, %eax
>> >  # endif
>> >  # ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrl    $2, %eax
>> >  # endif
>> >         VZEROUPPER_RETURN
>> > @@ -195,10 +221,14 @@ L(cross_page_continue):
>> >         /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
>> >            since data is only aligned to VEC_SIZE.  */
>> >  # ifdef USE_AS_STRNLEN
>> > -       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
>> > -          it simplies the logic in last_4x_vec_or_less.  */
>> > +       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
>> > +          because it simplies the logic in last_4x_vec_or_less.  */
>> >         leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
>> >         subq    %rdx, %rcx
>> > +#  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
>> > +       sarl    $2, %ecx
>> > +#  endif
>> >  # endif
>> >         /* Load first VEC regardless.  */
>> >         VPCMPEQ 1(%rdi), %ymm0, %ymm1
>> > @@ -207,34 +237,38 @@ L(cross_page_continue):
>> >         subq    %rcx, %rsi
>> >         jb      L(last_4x_vec_or_less)
>> >  # endif
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >         testl   %eax, %eax
>> >         jnz     L(first_vec_x1)
>> >
>> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >         testl   %eax, %eax
>> >         jnz     L(first_vec_x2)
>> >
>> >         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >         testl   %eax, %eax
>> >         jnz     L(first_vec_x3)
>> >
>> >         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >         testl   %eax, %eax
>> >         jnz     L(first_vec_x4)
>> >
>> >         /* Align data to VEC_SIZE * 4 - 1.  */
>> >  # ifdef USE_AS_STRNLEN
>> >         /* Before adjusting length check if at last VEC_SIZE * 4.  */
>> > -       cmpq    $(VEC_SIZE * 4 - 1), %rsi
>> > +       cmpq    $(CHAR_PER_VEC * 4 - 1), %rsi
>> >         jbe     L(last_4x_vec_or_less_load)
>> >         incq    %rdi
>> >         movl    %edi, %ecx
>> >         orq     $(VEC_SIZE * 4 - 1), %rdi
>> >         andl    $(VEC_SIZE * 4 - 1), %ecx
>> > +#  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
>> > +       sarl    $2, %ecx
>> > +#  endif
>> >         /* Readjust length.  */
>> >         addq    %rcx, %rsi
>> >  # else
>> > @@ -246,13 +280,13 @@ L(cross_page_continue):
>> >  L(loop_4x_vec):
>> >  # ifdef USE_AS_STRNLEN
>> >         /* Break if at end of length.  */
>> > -       subq    $(VEC_SIZE * 4), %rsi
>> > +       subq    $(CHAR_PER_VEC * 4), %rsi
>> >         jb      L(last_4x_vec_or_less_cmpeq)
>> >  # endif
>> > -       /* Save some code size by microfusing VPMINU with the load. Since
>> > -          the matches in ymm2/ymm4 can only be returned if there where no
>> > -          matches in ymm1/ymm3 respectively there is no issue with overlap.
>> > -        */
>> > +       /* Save some code size by microfusing VPMINU with the load.
>> > +          Since the matches in ymm2/ymm4 can only be returned if there
>> > +          where no matches in ymm1/ymm3 respectively there is no issue
>> > +          with overlap.  */
>> >         vmovdqa 1(%rdi), %ymm1
>> >         VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
>> >         vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
>> > @@ -260,7 +294,7 @@ L(loop_4x_vec):
>> >
>> >         VPMINU  %ymm2, %ymm4, %ymm5
>> >         VPCMPEQ %ymm5, %ymm0, %ymm5
>> > -       vpmovmskb       %ymm5, %ecx
>> > +       vpmovmskb %ymm5, %ecx
>> >
>> >         subq    $-(VEC_SIZE * 4), %rdi
>> >         testl   %ecx, %ecx
>> > @@ -268,27 +302,28 @@ L(loop_4x_vec):
>> >
>> >
>> >         VPCMPEQ %ymm1, %ymm0, %ymm1
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >         subq    %rdx, %rdi
>> >         testl   %eax, %eax
>> >         jnz     L(last_vec_return_x0)
>> >
>> >         VPCMPEQ %ymm2, %ymm0, %ymm2
>> > -       vpmovmskb       %ymm2, %eax
>> > +       vpmovmskb %ymm2, %eax
>> >         testl   %eax, %eax
>> >         jnz     L(last_vec_return_x1)
>> >
>> >         /* Combine last 2 VEC.  */
>> >         VPCMPEQ %ymm3, %ymm0, %ymm3
>> > -       vpmovmskb       %ymm3, %eax
>> > -       /* rcx has combined result from all 4 VEC. It will only be used if
>> > -          the first 3 other VEC all did not contain a match.  */
>> > +       vpmovmskb %ymm3, %eax
>> > +       /* rcx has combined result from all 4 VEC. It will only be used
>> > +          if the first 3 other VEC all did not contain a match.  */
>> >         salq    $32, %rcx
>> >         orq     %rcx, %rax
>> >         tzcntq  %rax, %rax
>> >         subq    $(VEC_SIZE * 2 - 1), %rdi
>> >         addq    %rdi, %rax
>> >  # ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrq    $2, %rax
>> >  # endif
>> >         VZEROUPPER_RETURN
>> > @@ -297,15 +332,19 @@ L(loop_4x_vec):
>> >  # ifdef USE_AS_STRNLEN
>> >         .p2align 4
>> >  L(last_4x_vec_or_less_load):
>> > -       /* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
>> > +       /* Depending on entry adjust rdi / prepare first VEC in ymm1.
>> > +        */
>> >         subq    $-(VEC_SIZE * 4), %rdi
>> >  L(last_4x_vec_or_less_cmpeq):
>> >         VPCMPEQ 1(%rdi), %ymm0, %ymm1
>> >  L(last_4x_vec_or_less):
>> > -
>> > -       vpmovmskb       %ymm1, %eax
>> > -       /* If remaining length > VEC_SIZE * 2. This works if esi is off by
>> > -          VEC_SIZE * 4.  */
>> > +#  ifdef USE_AS_WCSLEN
>> > +       /* NB: Multiply length by 4 to get byte count.  */
>> > +       sall    $2, %esi
>> > +#  endif
>> > +       vpmovmskb %ymm1, %eax
>> > +       /* If remaining length > VEC_SIZE * 2. This works if esi is off
>> > +          by VEC_SIZE * 4.  */
>> >         testl   $(VEC_SIZE * 2), %esi
>> >         jnz     L(last_4x_vec)
>> >
>> > @@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
>> >         jb      L(max)
>> >
>> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >         tzcntl  %eax, %eax
>> >         /* Check the end of data.  */
>> >         cmpl    %eax, %esi
>> > @@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
>> >         addl    $(VEC_SIZE + 1), %eax
>> >         addq    %rdi, %rax
>> >  #  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrq    $2, %rax
>> >  #  endif
>> >         VZEROUPPER_RETURN
>> > @@ -340,6 +380,7 @@ L(last_vec_return_x0):
>> >         subq    $(VEC_SIZE * 4 - 1), %rdi
>> >         addq    %rdi, %rax
>> >  # ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrq    $2, %rax
>> >  # endif
>> >         VZEROUPPER_RETURN
>> > @@ -350,6 +391,7 @@ L(last_vec_return_x1):
>> >         subq    $(VEC_SIZE * 3 - 1), %rdi
>> >         addq    %rdi, %rax
>> >  # ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrq    $2, %rax
>> >  # endif
>> >         VZEROUPPER_RETURN
>> > @@ -366,6 +408,7 @@ L(last_vec_x1_check):
>> >         incl    %eax
>> >         addq    %rdi, %rax
>> >  #  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrq    $2, %rax
>> >  #  endif
>> >         VZEROUPPER_RETURN
>> > @@ -381,14 +424,14 @@ L(last_4x_vec):
>> >         jnz     L(last_vec_x1)
>> >
>> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >         testl   %eax, %eax
>> >         jnz     L(last_vec_x2)
>> >
>> >         /* Normalize length.  */
>> >         andl    $(VEC_SIZE * 4 - 1), %esi
>> >         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >         testl   %eax, %eax
>> >         jnz     L(last_vec_x3)
>> >
>> > @@ -396,7 +439,7 @@ L(last_4x_vec):
>> >         jb      L(max)
>> >
>> >         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >         tzcntl  %eax, %eax
>> >         /* Check the end of data.  */
>> >         cmpl    %eax, %esi
>> > @@ -405,6 +448,7 @@ L(last_4x_vec):
>> >         addl    $(VEC_SIZE * 3 + 1), %eax
>> >         addq    %rdi, %rax
>> >  #  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrq    $2, %rax
>> >  #  endif
>> >         VZEROUPPER_RETURN
>> > @@ -419,6 +463,7 @@ L(last_vec_x1):
>> >         incl    %eax
>> >         addq    %rdi, %rax
>> >  #  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrq    $2, %rax
>> >  #  endif
>> >         VZEROUPPER_RETURN
>> > @@ -432,6 +477,7 @@ L(last_vec_x2):
>> >         addl    $(VEC_SIZE + 1), %eax
>> >         addq    %rdi, %rax
>> >  #  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrq    $2, %rax
>> >  #  endif
>> >         VZEROUPPER_RETURN
>> > @@ -447,6 +493,7 @@ L(last_vec_x3):
>> >         addl    $(VEC_SIZE * 2 + 1), %eax
>> >         addq    %rdi, %rax
>> >  #  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >         shrq    $2, %rax
>> >  #  endif
>> >         VZEROUPPER_RETURN
>> > @@ -455,13 +502,13 @@ L(max_end):
>> >         VZEROUPPER_RETURN
>> >  # endif
>> >
>> > -       /* Cold case for crossing page with first load.  */
>> > +       /* Cold case for crossing page with first load.  */
>> >         .p2align 4
>> >  L(cross_page_boundary):
>> >         /* Align data to VEC_SIZE - 1.  */
>> >         orq     $(VEC_SIZE - 1), %rdi
>> >         VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
>> > -       vpmovmskb       %ymm1, %eax
>> > +       vpmovmskb %ymm1, %eax
>> >         /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
>> >            so no need to manually mod rdx.  */
>> >         sarxl   %edx, %eax, %eax
>> > @@ -470,6 +517,10 @@ L(cross_page_boundary):
>> >         jnz     L(cross_page_less_vec)
>> >         leaq    1(%rdi), %rcx
>> >         subq    %rdx, %rcx
>> > +#  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> > +       shrl    $2, %ecx
>> > +#  endif
>> >         /* Check length.  */
>> >         cmpq    %rsi, %rcx
>> >         jb      L(cross_page_continue)
>> > @@ -479,6 +530,7 @@ L(cross_page_boundary):
>> >         jz      L(cross_page_continue)
>> >         tzcntl  %eax, %eax
>> >  #  ifdef USE_AS_WCSLEN
>> > +       /* NB: Divide length by 4 to get wchar_t count.  */
>> >         shrl    $2, %eax
>> >  #  endif
>> >  # endif
>> > @@ -489,6 +541,10 @@ L(return_vzeroupper):
>> >         .p2align 4
>> >  L(cross_page_less_vec):
>> >         tzcntl  %eax, %eax
>> > +#  ifdef USE_AS_WCSLEN
>> > +       /* NB: Multiply length by 4 to get byte count.  */
>> > +       sall    $2, %esi
>> > +#  endif
>> >         cmpq    %rax, %rsi
>> >         cmovb   %esi, %eax
>> >  #  ifdef USE_AS_WCSLEN
>> > diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
>> > index d223ea1700..3fc6734910 100644
>> > --- a/sysdeps/x86_64/strlen.S
>> > +++ b/sysdeps/x86_64/strlen.S
>> > @@ -65,12 +65,24 @@ ENTRY(strlen)
>> >         ret
>> >  L(n_nonzero):
>> >  # ifdef AS_WCSLEN
>> > -       shl     $2, %RSI_LP
>> > +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
>> > +   overflow the only way this program doesn't have undefined behavior
>> > +   is if there is a null terminator in valid memory so strlen will
>> > +   suffice.  */
>> > +       mov     %RSI_LP, %R10_LP
>> > +       sar     $62, %R10_LP
>> > +       test    %R10_LP, %R10_LP
>> > +       jnz     __wcslen_sse2
>>
>> Branch to  __wcslen_sse2 is wrong for 2 reasons:
>>
>> 1.  __wcslen_sse2 is undefined with --disable-multi-arch.
>
> Won't __wcsnlen_sse2 be undefined with --disable-multi-arch as well?
>
>>
>> 2. You should skip ENDBR64 at function entry.
>>
>> Please create a new label and branch to it.
>>
> I am not quite sure how to do this. I am trying to use
> strstr-sse2-unaligned.S as a template:
> https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S;h=21e1a5f7cfde8ec07fcc4fc80d26984a58d651d7;hb=HEAD#l78
> which appears to make a direct call to the global label of __strchr_sse2
> without anything special in strchr-sse2.S or strstr-sse2-unaligned.S.

This is different since all files are in sysdeps/x86_64/multiarch.

> Is there an example in the code you know of I can follow?

There are no exact same codes.

memmove-vec-unaligned-erms.S has

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
        movq    %rdi, %rax
L(start):   <<<<<<<<<<<<<<  This is equivalent to __wcslen_sse2.
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        movl    %edx, %edx
# endif

>>
>> > +       sal     $2, %RSI_LP
>> >  # endif
>> >
>> >  /* Initialize long lived registers.  */
>> >
>> >         add     %RDI_LP, %RSI_LP
>> > +# ifdef AS_WCSLEN
>> > +/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
>> > +       jbe     __wcslen_sse2
>> > +# endif
>> >         mov     %RSI_LP, %R10_LP
>> >         and     $-64, %R10_LP
>> >         mov     %RSI_LP, %R11_LP
>> > --
>> > 2.25.1
>> >
>>
>> Thanks.
>>
>>
>> --
>> H.J.
Noah Goldstein June 23, 2021, 3:11 a.m. UTC | #4
On Tue, Jun 22, 2021 at 7:29 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> On Tue, Jun 22, 2021 at 4:16 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >
> >
> >
> > On Tue, Jun 22, 2021 at 5:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >>
> >> On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <
> goldstein.w.n@gmail.com> wrote:
> >> >
> >> > This commit fixes the bug mentioned in the previous commit.
> >> >
> >> > The previous implementations of wmemchr in these files relied
> >> > on maxlen * sizeof(wchar_t) which was not guranteed by the standard.
> >> >
> >> > The new overflow tests added in the previous commit now
> >> > pass (As well as all the other tests).
> >> >
> >> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> >> > ---
> >> >  sysdeps/x86_64/multiarch/strlen-avx2.S | 130
> ++++++++++++++++++-------
> >> >  sysdeps/x86_64/strlen.S                |  14 ++-
> >> >  2 files changed, 106 insertions(+), 38 deletions(-)
> >> >
> >> > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S
> b/sysdeps/x86_64/multiarch/strlen-avx2.S
> >> > index bd2e6ee44a..b282a75613 100644
> >> > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> >> > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> >> > @@ -44,21 +44,21 @@
> >> >
> >> >  # define VEC_SIZE 32
> >> >  # define PAGE_SIZE 4096
> >> > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> >> >
> >> >         .section SECTION(.text),"ax",@progbits
> >> >  ENTRY (STRLEN)
> >> >  # ifdef USE_AS_STRNLEN
> >> >         /* Check zero length.  */
> >> > +#  ifdef __ILP32__
> >> > +       /* Clear upper bits.  */
> >> > +       and     %RSI_LP, %RSI_LP
> >> > +#  else
> >> >         test    %RSI_LP, %RSI_LP
> >> > +#  endif
> >> >         jz      L(zero)
> >> >         /* Store max len in R8_LP before adjusting if using WCSLEN.
> */
> >> >         mov     %RSI_LP, %R8_LP
> >> > -#  ifdef USE_AS_WCSLEN
> >> > -       shl     $2, %RSI_LP
> >> > -#  elif defined __ILP32__
> >> > -       /* Clear the upper 32 bits.  */
> >> > -       movl    %esi, %esi
> >> > -#  endif
> >> >  # endif
> >> >         movl    %edi, %eax
> >> >         movq    %rdi, %rdx
> >> > @@ -72,10 +72,10 @@ ENTRY (STRLEN)
> >> >
> >> >         /* Check the first VEC_SIZE bytes.  */
> >> >         VPCMPEQ (%rdi), %ymm0, %ymm1
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >  # ifdef USE_AS_STRNLEN
> >> >         /* If length < VEC_SIZE handle special.  */
> >> > -       cmpq    $VEC_SIZE, %rsi
> >> > +       cmpq    $CHAR_PER_VEC, %rsi
> >> >         jbe     L(first_vec_x0)
> >> >  # endif
> >> >         /* If empty continue to aligned_more. Otherwise return bit
> >> > @@ -84,6 +84,7 @@ ENTRY (STRLEN)
> >> >         jz      L(aligned_more)
> >> >         tzcntl  %eax, %eax
> >> >  # ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrl    $2, %eax
> >> >  # endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -97,9 +98,14 @@ L(zero):
> >> >  L(first_vec_x0):
> >> >         /* Set bit for max len so that tzcnt will return min of max
> len
> >> >            and position of first match.  */
> >> > +#  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Multiply length by 4 to get byte count.  */
> >> > +       sall    $2, %esi
> >> > +#  endif
> >> >         btsq    %rsi, %rax
> >> >         tzcntl  %eax, %eax
> >> >  #  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrl    $2, %eax
> >> >  #  endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -113,14 +119,19 @@ L(first_vec_x1):
> >> >  # ifdef USE_AS_STRNLEN
> >> >         /* Use ecx which was computed earlier to compute correct
> value.
> >> >          */
> >> > +#  ifdef USE_AS_WCSLEN
> >> > +       leal    -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
> >> > +#  else
> >> >         subl    $(VEC_SIZE * 4 + 1), %ecx
> >> >         addl    %ecx, %eax
> >> > +#  endif
> >> >  # else
> >> >         subl    %edx, %edi
> >> >         incl    %edi
> >> >         addl    %edi, %eax
> >> >  # endif
> >> >  # ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrl    $2, %eax
> >> >  # endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -133,14 +144,19 @@ L(first_vec_x2):
> >> >  # ifdef USE_AS_STRNLEN
> >> >         /* Use ecx which was computed earlier to compute correct
> value.
> >> >          */
> >> > +#  ifdef USE_AS_WCSLEN
> >> > +       leal    -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
> >> > +#  else
> >> >         subl    $(VEC_SIZE * 3 + 1), %ecx
> >> >         addl    %ecx, %eax
> >> > +#  endif
> >> >  # else
> >> >         subl    %edx, %edi
> >> >         addl    $(VEC_SIZE + 1), %edi
> >> >         addl    %edi, %eax
> >> >  # endif
> >> >  # ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrl    $2, %eax
> >> >  # endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -153,14 +169,19 @@ L(first_vec_x3):
> >> >  # ifdef USE_AS_STRNLEN
> >> >         /* Use ecx which was computed earlier to compute correct
> value.
> >> >          */
> >> > +#  ifdef USE_AS_WCSLEN
> >> > +       leal    -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
> >> > +#  else
> >> >         subl    $(VEC_SIZE * 2 + 1), %ecx
> >> >         addl    %ecx, %eax
> >> > +#  endif
> >> >  # else
> >> >         subl    %edx, %edi
> >> >         addl    $(VEC_SIZE * 2 + 1), %edi
> >> >         addl    %edi, %eax
> >> >  # endif
> >> >  # ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrl    $2, %eax
> >> >  # endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -173,14 +194,19 @@ L(first_vec_x4):
> >> >  # ifdef USE_AS_STRNLEN
> >> >         /* Use ecx which was computed earlier to compute correct
> value.
> >> >          */
> >> > +#  ifdef USE_AS_WCSLEN
> >> > +       leal    -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
> >> > +#  else
> >> >         subl    $(VEC_SIZE + 1), %ecx
> >> >         addl    %ecx, %eax
> >> > +#  endif
> >> >  # else
> >> >         subl    %edx, %edi
> >> >         addl    $(VEC_SIZE * 3 + 1), %edi
> >> >         addl    %edi, %eax
> >> >  # endif
> >> >  # ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrl    $2, %eax
> >> >  # endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -195,10 +221,14 @@ L(cross_page_continue):
> >> >         /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> >> >            since data is only aligned to VEC_SIZE.  */
> >> >  # ifdef USE_AS_STRNLEN
> >> > -       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
> because
> >> > -          it simplies the logic in last_4x_vec_or_less.  */
> >> > +       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
> >> > +          because it simplies the logic in last_4x_vec_or_less.  */
> >> >         leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
> >> >         subq    %rdx, %rcx
> >> > +#  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> >> > +       sarl    $2, %ecx
> >> > +#  endif
> >> >  # endif
> >> >         /* Load first VEC regardless.  */
> >> >         VPCMPEQ 1(%rdi), %ymm0, %ymm1
> >> > @@ -207,34 +237,38 @@ L(cross_page_continue):
> >> >         subq    %rcx, %rsi
> >> >         jb      L(last_4x_vec_or_less)
> >> >  # endif
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >         testl   %eax, %eax
> >> >         jnz     L(first_vec_x1)
> >> >
> >> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >         testl   %eax, %eax
> >> >         jnz     L(first_vec_x2)
> >> >
> >> >         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >         testl   %eax, %eax
> >> >         jnz     L(first_vec_x3)
> >> >
> >> >         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >         testl   %eax, %eax
> >> >         jnz     L(first_vec_x4)
> >> >
> >> >         /* Align data to VEC_SIZE * 4 - 1.  */
> >> >  # ifdef USE_AS_STRNLEN
> >> >         /* Before adjusting length check if at last VEC_SIZE * 4.  */
> >> > -       cmpq    $(VEC_SIZE * 4 - 1), %rsi
> >> > +       cmpq    $(CHAR_PER_VEC * 4 - 1), %rsi
> >> >         jbe     L(last_4x_vec_or_less_load)
> >> >         incq    %rdi
> >> >         movl    %edi, %ecx
> >> >         orq     $(VEC_SIZE * 4 - 1), %rdi
> >> >         andl    $(VEC_SIZE * 4 - 1), %ecx
> >> > +#  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> >> > +       sarl    $2, %ecx
> >> > +#  endif
> >> >         /* Readjust length.  */
> >> >         addq    %rcx, %rsi
> >> >  # else
> >> > @@ -246,13 +280,13 @@ L(cross_page_continue):
> >> >  L(loop_4x_vec):
> >> >  # ifdef USE_AS_STRNLEN
> >> >         /* Break if at end of length.  */
> >> > -       subq    $(VEC_SIZE * 4), %rsi
> >> > +       subq    $(CHAR_PER_VEC * 4), %rsi
> >> >         jb      L(last_4x_vec_or_less_cmpeq)
> >> >  # endif
> >> > -       /* Save some code size by microfusing VPMINU with the load.
> Since
> >> > -          the matches in ymm2/ymm4 can only be returned if there
> where no
> >> > -          matches in ymm1/ymm3 respectively there is no issue with
> overlap.
> >> > -        */
> >> > +       /* Save some code size by microfusing VPMINU with the load.
> >> > +          Since the matches in ymm2/ymm4 can only be returned if
> there
> >> > +          where no matches in ymm1/ymm3 respectively there is no
> issue
> >> > +          with overlap.  */
> >> >         vmovdqa 1(%rdi), %ymm1
> >> >         VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
> >> >         vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> >> > @@ -260,7 +294,7 @@ L(loop_4x_vec):
> >> >
> >> >         VPMINU  %ymm2, %ymm4, %ymm5
> >> >         VPCMPEQ %ymm5, %ymm0, %ymm5
> >> > -       vpmovmskb       %ymm5, %ecx
> >> > +       vpmovmskb %ymm5, %ecx
> >> >
> >> >         subq    $-(VEC_SIZE * 4), %rdi
> >> >         testl   %ecx, %ecx
> >> > @@ -268,27 +302,28 @@ L(loop_4x_vec):
> >> >
> >> >
> >> >         VPCMPEQ %ymm1, %ymm0, %ymm1
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >         subq    %rdx, %rdi
> >> >         testl   %eax, %eax
> >> >         jnz     L(last_vec_return_x0)
> >> >
> >> >         VPCMPEQ %ymm2, %ymm0, %ymm2
> >> > -       vpmovmskb       %ymm2, %eax
> >> > +       vpmovmskb %ymm2, %eax
> >> >         testl   %eax, %eax
> >> >         jnz     L(last_vec_return_x1)
> >> >
> >> >         /* Combine last 2 VEC.  */
> >> >         VPCMPEQ %ymm3, %ymm0, %ymm3
> >> > -       vpmovmskb       %ymm3, %eax
> >> > -       /* rcx has combined result from all 4 VEC. It will only be
> used if
> >> > -          the first 3 other VEC all did not contain a match.  */
> >> > +       vpmovmskb %ymm3, %eax
> >> > +       /* rcx has combined result from all 4 VEC. It will only be
> used
> >> > +          if the first 3 other VEC all did not contain a match.  */
> >> >         salq    $32, %rcx
> >> >         orq     %rcx, %rax
> >> >         tzcntq  %rax, %rax
> >> >         subq    $(VEC_SIZE * 2 - 1), %rdi
> >> >         addq    %rdi, %rax
> >> >  # ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrq    $2, %rax
> >> >  # endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -297,15 +332,19 @@ L(loop_4x_vec):
> >> >  # ifdef USE_AS_STRNLEN
> >> >         .p2align 4
> >> >  L(last_4x_vec_or_less_load):
> >> > -       /* Depending on entry adjust rdi / prepare first VEC in
> ymm1.  */
> >> > +       /* Depending on entry adjust rdi / prepare first VEC in ymm1.
> >> > +        */
> >> >         subq    $-(VEC_SIZE * 4), %rdi
> >> >  L(last_4x_vec_or_less_cmpeq):
> >> >         VPCMPEQ 1(%rdi), %ymm0, %ymm1
> >> >  L(last_4x_vec_or_less):
> >> > -
> >> > -       vpmovmskb       %ymm1, %eax
> >> > -       /* If remaining length > VEC_SIZE * 2. This works if esi is
> off by
> >> > -          VEC_SIZE * 4.  */
> >> > +#  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Multiply length by 4 to get byte count.  */
> >> > +       sall    $2, %esi
> >> > +#  endif
> >> > +       vpmovmskb %ymm1, %eax
> >> > +       /* If remaining length > VEC_SIZE * 2. This works if esi is
> off
> >> > +          by VEC_SIZE * 4.  */
> >> >         testl   $(VEC_SIZE * 2), %esi
> >> >         jnz     L(last_4x_vec)
> >> >
> >> > @@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
> >> >         jb      L(max)
> >> >
> >> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >         tzcntl  %eax, %eax
> >> >         /* Check the end of data.  */
> >> >         cmpl    %eax, %esi
> >> > @@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
> >> >         addl    $(VEC_SIZE + 1), %eax
> >> >         addq    %rdi, %rax
> >> >  #  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrq    $2, %rax
> >> >  #  endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -340,6 +380,7 @@ L(last_vec_return_x0):
> >> >         subq    $(VEC_SIZE * 4 - 1), %rdi
> >> >         addq    %rdi, %rax
> >> >  # ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrq    $2, %rax
> >> >  # endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -350,6 +391,7 @@ L(last_vec_return_x1):
> >> >         subq    $(VEC_SIZE * 3 - 1), %rdi
> >> >         addq    %rdi, %rax
> >> >  # ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrq    $2, %rax
> >> >  # endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -366,6 +408,7 @@ L(last_vec_x1_check):
> >> >         incl    %eax
> >> >         addq    %rdi, %rax
> >> >  #  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrq    $2, %rax
> >> >  #  endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -381,14 +424,14 @@ L(last_4x_vec):
> >> >         jnz     L(last_vec_x1)
> >> >
> >> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >         testl   %eax, %eax
> >> >         jnz     L(last_vec_x2)
> >> >
> >> >         /* Normalize length.  */
> >> >         andl    $(VEC_SIZE * 4 - 1), %esi
> >> >         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >         testl   %eax, %eax
> >> >         jnz     L(last_vec_x3)
> >> >
> >> > @@ -396,7 +439,7 @@ L(last_4x_vec):
> >> >         jb      L(max)
> >> >
> >> >         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >         tzcntl  %eax, %eax
> >> >         /* Check the end of data.  */
> >> >         cmpl    %eax, %esi
> >> > @@ -405,6 +448,7 @@ L(last_4x_vec):
> >> >         addl    $(VEC_SIZE * 3 + 1), %eax
> >> >         addq    %rdi, %rax
> >> >  #  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrq    $2, %rax
> >> >  #  endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -419,6 +463,7 @@ L(last_vec_x1):
> >> >         incl    %eax
> >> >         addq    %rdi, %rax
> >> >  #  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrq    $2, %rax
> >> >  #  endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -432,6 +477,7 @@ L(last_vec_x2):
> >> >         addl    $(VEC_SIZE + 1), %eax
> >> >         addq    %rdi, %rax
> >> >  #  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrq    $2, %rax
> >> >  #  endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -447,6 +493,7 @@ L(last_vec_x3):
> >> >         addl    $(VEC_SIZE * 2 + 1), %eax
> >> >         addq    %rdi, %rax
> >> >  #  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >         shrq    $2, %rax
> >> >  #  endif
> >> >         VZEROUPPER_RETURN
> >> > @@ -455,13 +502,13 @@ L(max_end):
> >> >         VZEROUPPER_RETURN
> >> >  # endif
> >> >
> >> > -       /* Cold case for crossing page with first load.  */
> >> > +       /* Cold case for crossing page with first load.  */
> >> >         .p2align 4
> >> >  L(cross_page_boundary):
> >> >         /* Align data to VEC_SIZE - 1.  */
> >> >         orq     $(VEC_SIZE - 1), %rdi
> >> >         VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> >> > -       vpmovmskb       %ymm1, %eax
> >> > +       vpmovmskb %ymm1, %eax
> >> >         /* Remove the leading bytes. sarxl only uses bits [5:0] of
> COUNT
> >> >            so no need to manually mod rdx.  */
> >> >         sarxl   %edx, %eax, %eax
> >> > @@ -470,6 +517,10 @@ L(cross_page_boundary):
> >> >         jnz     L(cross_page_less_vec)
> >> >         leaq    1(%rdi), %rcx
> >> >         subq    %rdx, %rcx
> >> > +#  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> > +       shrl    $2, %ecx
> >> > +#  endif
> >> >         /* Check length.  */
> >> >         cmpq    %rsi, %rcx
> >> >         jb      L(cross_page_continue)
> >> > @@ -479,6 +530,7 @@ L(cross_page_boundary):
> >> >         jz      L(cross_page_continue)
> >> >         tzcntl  %eax, %eax
> >> >  #  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Divide length by 4 to get wchar_t count.  */
> >> >         shrl    $2, %eax
> >> >  #  endif
> >> >  # endif
> >> > @@ -489,6 +541,10 @@ L(return_vzeroupper):
> >> >         .p2align 4
> >> >  L(cross_page_less_vec):
> >> >         tzcntl  %eax, %eax
> >> > +#  ifdef USE_AS_WCSLEN
> >> > +       /* NB: Multiply length by 4 to get byte count.  */
> >> > +       sall    $2, %esi
> >> > +#  endif
> >> >         cmpq    %rax, %rsi
> >> >         cmovb   %esi, %eax
> >> >  #  ifdef USE_AS_WCSLEN
> >> > diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
> >> > index d223ea1700..3fc6734910 100644
> >> > --- a/sysdeps/x86_64/strlen.S
> >> > +++ b/sysdeps/x86_64/strlen.S
> >> > @@ -65,12 +65,24 @@ ENTRY(strlen)
> >> >         ret
> >> >  L(n_nonzero):
> >> >  # ifdef AS_WCSLEN
> >> > -       shl     $2, %RSI_LP
> >> > +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
> >> > +   overflow the only way this program doesn't have undefined behavior
> >> > +   is if there is a null terminator in valid memory so strlen will
> >> > +   suffice.  */
> >> > +       mov     %RSI_LP, %R10_LP
> >> > +       sar     $62, %R10_LP
> >> > +       test    %R10_LP, %R10_LP
> >> > +       jnz     __wcslen_sse2
> >>
> >> Branch to  __wcslen_sse2 is wrong for 2 reasons:
> >>
> >> 1.  __wcslen_sse2 is undefined with --disable-multi-arch.
> >
> > Won't __wcsnlen_sse2 be undefined with --disable-multi-arch as well?
> >
> >>
> >> 2. You should skip ENDBR64 at function entry.
> >>
> >> Please create a new label and branch to it.
> >>
> > I am not quite sure how to do this. I am trying to use
> > strstr-sse2-unaligned.S as a template:
> >
> https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S;h=21e1a5f7cfde8ec07fcc4fc80d26984a58d651d7;hb=HEAD#l78
> > which appears to make a direct call to the global label of __strchr_sse2
> > without anything special in strchr-sse2.S or strstr-sse2-unaligned.S.


> This is different since all files are in sysdeps/x86_64/multiarch.
>

I see. So it turns out we are missing wcslen_sse4_1 which strlen.S
can also implement (it passes all tests). Would jumping to that be
valid?

Otherwise I think the best bet is to add a target  for wcslen_sse4_1
and define it and wcsnlen_sse4_1 in the same file so the label is visible.
The only issue is the #defines in strlen.S need to all be protected which
is a bit messy. If we don't want to define wcslen_sse4_1 for whatever
reason, I already have this approach working with defining
wcsnlen_sse4_1 in the same file as wcslen-sse2.S and entering from
a local label. But looking at the code it seems the strlen.S file is a bit
better optimized. Thoughts?


> > Is there an example in the code you know of I can follow?
>
> There are no exact same codes.
>
> memmove-vec-unaligned-erms.S has
>
> ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
>         movq    %rdi, %rax
> L(start):   <<<<<<<<<<<<<<  This is equivalent to __wcslen_sse2.
> # ifdef __ILP32__
>         /* Clear the upper 32 bits.  */
>         movl    %edx, %edx
> # endif


> >>
> >> > +       sal     $2, %RSI_LP
> >> >  # endif
> >> >
> >> >  /* Initialize long lived registers.  */
> >> >
> >> >         add     %RDI_LP, %RSI_LP
> >> > +# ifdef AS_WCSLEN
> >> > +/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
> >> > +       jbe     __wcslen_sse2
> >> > +# endif
> >> >         mov     %RSI_LP, %R10_LP
> >> >         and     $-64, %R10_LP
> >> >         mov     %RSI_LP, %R11_LP
> >> > --
> >> > 2.25.1
> >> >
> >>
> >> Thanks.
> >>
> >>
> >> --
> >> H.J.
>
>
>
> --
> H.J.
>
H.J. Lu June 23, 2021, 3:58 a.m. UTC | #5
On Tue, Jun 22, 2021 at 8:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
>
>
>
> On Tue, Jun 22, 2021 at 7:29 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> On Tue, Jun 22, 2021 at 4:16 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> >
>> >
>> >
>> > On Tue, Jun 22, 2021 at 5:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>> >>
>> >> On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> >> >
>> >> > This commit fixes the bug mentioned in the previous commit.
>> >> >
>> >> > The previous implementations of wmemchr in these files relied
>> >> > on maxlen * sizeof(wchar_t) which was not guranteed by the standard.
>> >> >
>> >> > The new overflow tests added in the previous commit now
>> >> > pass (As well as all the other tests).
>> >> >
>> >> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
>> >> > ---
>> >> >  sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
>> >> >  sysdeps/x86_64/strlen.S                |  14 ++-
>> >> >  2 files changed, 106 insertions(+), 38 deletions(-)
>> >> >
>> >> > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
>> >> > index bd2e6ee44a..b282a75613 100644
>> >> > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
>> >> > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
>> >> > @@ -44,21 +44,21 @@
>> >> >
>> >> >  # define VEC_SIZE 32
>> >> >  # define PAGE_SIZE 4096
>> >> > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>> >> >
>> >> >         .section SECTION(.text),"ax",@progbits
>> >> >  ENTRY (STRLEN)
>> >> >  # ifdef USE_AS_STRNLEN
>> >> >         /* Check zero length.  */
>> >> > +#  ifdef __ILP32__
>> >> > +       /* Clear upper bits.  */
>> >> > +       and     %RSI_LP, %RSI_LP
>> >> > +#  else
>> >> >         test    %RSI_LP, %RSI_LP
>> >> > +#  endif
>> >> >         jz      L(zero)
>> >> >         /* Store max len in R8_LP before adjusting if using WCSLEN.  */
>> >> >         mov     %RSI_LP, %R8_LP
>> >> > -#  ifdef USE_AS_WCSLEN
>> >> > -       shl     $2, %RSI_LP
>> >> > -#  elif defined __ILP32__
>> >> > -       /* Clear the upper 32 bits.  */
>> >> > -       movl    %esi, %esi
>> >> > -#  endif
>> >> >  # endif
>> >> >         movl    %edi, %eax
>> >> >         movq    %rdi, %rdx
>> >> > @@ -72,10 +72,10 @@ ENTRY (STRLEN)
>> >> >
>> >> >         /* Check the first VEC_SIZE bytes.  */
>> >> >         VPCMPEQ (%rdi), %ymm0, %ymm1
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >  # ifdef USE_AS_STRNLEN
>> >> >         /* If length < VEC_SIZE handle special.  */
>> >> > -       cmpq    $VEC_SIZE, %rsi
>> >> > +       cmpq    $CHAR_PER_VEC, %rsi
>> >> >         jbe     L(first_vec_x0)
>> >> >  # endif
>> >> >         /* If empty continue to aligned_more. Otherwise return bit
>> >> > @@ -84,6 +84,7 @@ ENTRY (STRLEN)
>> >> >         jz      L(aligned_more)
>> >> >         tzcntl  %eax, %eax
>> >> >  # ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrl    $2, %eax
>> >> >  # endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -97,9 +98,14 @@ L(zero):
>> >> >  L(first_vec_x0):
>> >> >         /* Set bit for max len so that tzcnt will return min of max len
>> >> >            and position of first match.  */
>> >> > +#  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Multiply length by 4 to get byte count.  */
>> >> > +       sall    $2, %esi
>> >> > +#  endif
>> >> >         btsq    %rsi, %rax
>> >> >         tzcntl  %eax, %eax
>> >> >  #  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrl    $2, %eax
>> >> >  #  endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -113,14 +119,19 @@ L(first_vec_x1):
>> >> >  # ifdef USE_AS_STRNLEN
>> >> >         /* Use ecx which was computed earlier to compute correct value.
>> >> >          */
>> >> > +#  ifdef USE_AS_WCSLEN
>> >> > +       leal    -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
>> >> > +#  else
>> >> >         subl    $(VEC_SIZE * 4 + 1), %ecx
>> >> >         addl    %ecx, %eax
>> >> > +#  endif
>> >> >  # else
>> >> >         subl    %edx, %edi
>> >> >         incl    %edi
>> >> >         addl    %edi, %eax
>> >> >  # endif
>> >> >  # ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrl    $2, %eax
>> >> >  # endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -133,14 +144,19 @@ L(first_vec_x2):
>> >> >  # ifdef USE_AS_STRNLEN
>> >> >         /* Use ecx which was computed earlier to compute correct value.
>> >> >          */
>> >> > +#  ifdef USE_AS_WCSLEN
>> >> > +       leal    -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
>> >> > +#  else
>> >> >         subl    $(VEC_SIZE * 3 + 1), %ecx
>> >> >         addl    %ecx, %eax
>> >> > +#  endif
>> >> >  # else
>> >> >         subl    %edx, %edi
>> >> >         addl    $(VEC_SIZE + 1), %edi
>> >> >         addl    %edi, %eax
>> >> >  # endif
>> >> >  # ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrl    $2, %eax
>> >> >  # endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -153,14 +169,19 @@ L(first_vec_x3):
>> >> >  # ifdef USE_AS_STRNLEN
>> >> >         /* Use ecx which was computed earlier to compute correct value.
>> >> >          */
>> >> > +#  ifdef USE_AS_WCSLEN
>> >> > +       leal    -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
>> >> > +#  else
>> >> >         subl    $(VEC_SIZE * 2 + 1), %ecx
>> >> >         addl    %ecx, %eax
>> >> > +#  endif
>> >> >  # else
>> >> >         subl    %edx, %edi
>> >> >         addl    $(VEC_SIZE * 2 + 1), %edi
>> >> >         addl    %edi, %eax
>> >> >  # endif
>> >> >  # ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrl    $2, %eax
>> >> >  # endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -173,14 +194,19 @@ L(first_vec_x4):
>> >> >  # ifdef USE_AS_STRNLEN
>> >> >         /* Use ecx which was computed earlier to compute correct value.
>> >> >          */
>> >> > +#  ifdef USE_AS_WCSLEN
>> >> > +       leal    -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
>> >> > +#  else
>> >> >         subl    $(VEC_SIZE + 1), %ecx
>> >> >         addl    %ecx, %eax
>> >> > +#  endif
>> >> >  # else
>> >> >         subl    %edx, %edi
>> >> >         addl    $(VEC_SIZE * 3 + 1), %edi
>> >> >         addl    %edi, %eax
>> >> >  # endif
>> >> >  # ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrl    $2, %eax
>> >> >  # endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -195,10 +221,14 @@ L(cross_page_continue):
>> >> >         /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
>> >> >            since data is only aligned to VEC_SIZE.  */
>> >> >  # ifdef USE_AS_STRNLEN
>> >> > -       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
>> >> > -          it simplies the logic in last_4x_vec_or_less.  */
>> >> > +       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
>> >> > +          because it simplies the logic in last_4x_vec_or_less.  */
>> >> >         leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
>> >> >         subq    %rdx, %rcx
>> >> > +#  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
>> >> > +       sarl    $2, %ecx
>> >> > +#  endif
>> >> >  # endif
>> >> >         /* Load first VEC regardless.  */
>> >> >         VPCMPEQ 1(%rdi), %ymm0, %ymm1
>> >> > @@ -207,34 +237,38 @@ L(cross_page_continue):
>> >> >         subq    %rcx, %rsi
>> >> >         jb      L(last_4x_vec_or_less)
>> >> >  # endif
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >         testl   %eax, %eax
>> >> >         jnz     L(first_vec_x1)
>> >> >
>> >> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >         testl   %eax, %eax
>> >> >         jnz     L(first_vec_x2)
>> >> >
>> >> >         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >         testl   %eax, %eax
>> >> >         jnz     L(first_vec_x3)
>> >> >
>> >> >         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >         testl   %eax, %eax
>> >> >         jnz     L(first_vec_x4)
>> >> >
>> >> >         /* Align data to VEC_SIZE * 4 - 1.  */
>> >> >  # ifdef USE_AS_STRNLEN
>> >> >         /* Before adjusting length check if at last VEC_SIZE * 4.  */
>> >> > -       cmpq    $(VEC_SIZE * 4 - 1), %rsi
>> >> > +       cmpq    $(CHAR_PER_VEC * 4 - 1), %rsi
>> >> >         jbe     L(last_4x_vec_or_less_load)
>> >> >         incq    %rdi
>> >> >         movl    %edi, %ecx
>> >> >         orq     $(VEC_SIZE * 4 - 1), %rdi
>> >> >         andl    $(VEC_SIZE * 4 - 1), %ecx
>> >> > +#  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
>> >> > +       sarl    $2, %ecx
>> >> > +#  endif
>> >> >         /* Readjust length.  */
>> >> >         addq    %rcx, %rsi
>> >> >  # else
>> >> > @@ -246,13 +280,13 @@ L(cross_page_continue):
>> >> >  L(loop_4x_vec):
>> >> >  # ifdef USE_AS_STRNLEN
>> >> >         /* Break if at end of length.  */
>> >> > -       subq    $(VEC_SIZE * 4), %rsi
>> >> > +       subq    $(CHAR_PER_VEC * 4), %rsi
>> >> >         jb      L(last_4x_vec_or_less_cmpeq)
>> >> >  # endif
>> >> > -       /* Save some code size by microfusing VPMINU with the load. Since
>> >> > -          the matches in ymm2/ymm4 can only be returned if there where no
>> >> > -          matches in ymm1/ymm3 respectively there is no issue with overlap.
>> >> > -        */
>> >> > +       /* Save some code size by microfusing VPMINU with the load.
>> >> > +          Since the matches in ymm2/ymm4 can only be returned if there
>> >> > +          where no matches in ymm1/ymm3 respectively there is no issue
>> >> > +          with overlap.  */
>> >> >         vmovdqa 1(%rdi), %ymm1
>> >> >         VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
>> >> >         vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
>> >> > @@ -260,7 +294,7 @@ L(loop_4x_vec):
>> >> >
>> >> >         VPMINU  %ymm2, %ymm4, %ymm5
>> >> >         VPCMPEQ %ymm5, %ymm0, %ymm5
>> >> > -       vpmovmskb       %ymm5, %ecx
>> >> > +       vpmovmskb %ymm5, %ecx
>> >> >
>> >> >         subq    $-(VEC_SIZE * 4), %rdi
>> >> >         testl   %ecx, %ecx
>> >> > @@ -268,27 +302,28 @@ L(loop_4x_vec):
>> >> >
>> >> >
>> >> >         VPCMPEQ %ymm1, %ymm0, %ymm1
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >         subq    %rdx, %rdi
>> >> >         testl   %eax, %eax
>> >> >         jnz     L(last_vec_return_x0)
>> >> >
>> >> >         VPCMPEQ %ymm2, %ymm0, %ymm2
>> >> > -       vpmovmskb       %ymm2, %eax
>> >> > +       vpmovmskb %ymm2, %eax
>> >> >         testl   %eax, %eax
>> >> >         jnz     L(last_vec_return_x1)
>> >> >
>> >> >         /* Combine last 2 VEC.  */
>> >> >         VPCMPEQ %ymm3, %ymm0, %ymm3
>> >> > -       vpmovmskb       %ymm3, %eax
>> >> > -       /* rcx has combined result from all 4 VEC. It will only be used if
>> >> > -          the first 3 other VEC all did not contain a match.  */
>> >> > +       vpmovmskb %ymm3, %eax
>> >> > +       /* rcx has combined result from all 4 VEC. It will only be used
>> >> > +          if the first 3 other VEC all did not contain a match.  */
>> >> >         salq    $32, %rcx
>> >> >         orq     %rcx, %rax
>> >> >         tzcntq  %rax, %rax
>> >> >         subq    $(VEC_SIZE * 2 - 1), %rdi
>> >> >         addq    %rdi, %rax
>> >> >  # ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrq    $2, %rax
>> >> >  # endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -297,15 +332,19 @@ L(loop_4x_vec):
>> >> >  # ifdef USE_AS_STRNLEN
>> >> >         .p2align 4
>> >> >  L(last_4x_vec_or_less_load):
>> >> > -       /* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
>> >> > +       /* Depending on entry adjust rdi / prepare first VEC in ymm1.
>> >> > +        */
>> >> >         subq    $-(VEC_SIZE * 4), %rdi
>> >> >  L(last_4x_vec_or_less_cmpeq):
>> >> >         VPCMPEQ 1(%rdi), %ymm0, %ymm1
>> >> >  L(last_4x_vec_or_less):
>> >> > -
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > -       /* If remaining length > VEC_SIZE * 2. This works if esi is off by
>> >> > -          VEC_SIZE * 4.  */
>> >> > +#  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Multiply length by 4 to get byte count.  */
>> >> > +       sall    $2, %esi
>> >> > +#  endif
>> >> > +       vpmovmskb %ymm1, %eax
>> >> > +       /* If remaining length > VEC_SIZE * 2. This works if esi is off
>> >> > +          by VEC_SIZE * 4.  */
>> >> >         testl   $(VEC_SIZE * 2), %esi
>> >> >         jnz     L(last_4x_vec)
>> >> >
>> >> > @@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
>> >> >         jb      L(max)
>> >> >
>> >> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >         tzcntl  %eax, %eax
>> >> >         /* Check the end of data.  */
>> >> >         cmpl    %eax, %esi
>> >> > @@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
>> >> >         addl    $(VEC_SIZE + 1), %eax
>> >> >         addq    %rdi, %rax
>> >> >  #  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrq    $2, %rax
>> >> >  #  endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -340,6 +380,7 @@ L(last_vec_return_x0):
>> >> >         subq    $(VEC_SIZE * 4 - 1), %rdi
>> >> >         addq    %rdi, %rax
>> >> >  # ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrq    $2, %rax
>> >> >  # endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -350,6 +391,7 @@ L(last_vec_return_x1):
>> >> >         subq    $(VEC_SIZE * 3 - 1), %rdi
>> >> >         addq    %rdi, %rax
>> >> >  # ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrq    $2, %rax
>> >> >  # endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -366,6 +408,7 @@ L(last_vec_x1_check):
>> >> >         incl    %eax
>> >> >         addq    %rdi, %rax
>> >> >  #  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrq    $2, %rax
>> >> >  #  endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -381,14 +424,14 @@ L(last_4x_vec):
>> >> >         jnz     L(last_vec_x1)
>> >> >
>> >> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >         testl   %eax, %eax
>> >> >         jnz     L(last_vec_x2)
>> >> >
>> >> >         /* Normalize length.  */
>> >> >         andl    $(VEC_SIZE * 4 - 1), %esi
>> >> >         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >         testl   %eax, %eax
>> >> >         jnz     L(last_vec_x3)
>> >> >
>> >> > @@ -396,7 +439,7 @@ L(last_4x_vec):
>> >> >         jb      L(max)
>> >> >
>> >> >         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >         tzcntl  %eax, %eax
>> >> >         /* Check the end of data.  */
>> >> >         cmpl    %eax, %esi
>> >> > @@ -405,6 +448,7 @@ L(last_4x_vec):
>> >> >         addl    $(VEC_SIZE * 3 + 1), %eax
>> >> >         addq    %rdi, %rax
>> >> >  #  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrq    $2, %rax
>> >> >  #  endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -419,6 +463,7 @@ L(last_vec_x1):
>> >> >         incl    %eax
>> >> >         addq    %rdi, %rax
>> >> >  #  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrq    $2, %rax
>> >> >  #  endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -432,6 +477,7 @@ L(last_vec_x2):
>> >> >         addl    $(VEC_SIZE + 1), %eax
>> >> >         addq    %rdi, %rax
>> >> >  #  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrq    $2, %rax
>> >> >  #  endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -447,6 +493,7 @@ L(last_vec_x3):
>> >> >         addl    $(VEC_SIZE * 2 + 1), %eax
>> >> >         addq    %rdi, %rax
>> >> >  #  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> >         shrq    $2, %rax
>> >> >  #  endif
>> >> >         VZEROUPPER_RETURN
>> >> > @@ -455,13 +502,13 @@ L(max_end):
>> >> >         VZEROUPPER_RETURN
>> >> >  # endif
>> >> >
>> >> > -       /* Cold case for crossing page with first load.  */
>> >> > +       /* Cold case for crossing page with first load.  */
>> >> >         .p2align 4
>> >> >  L(cross_page_boundary):
>> >> >         /* Align data to VEC_SIZE - 1.  */
>> >> >         orq     $(VEC_SIZE - 1), %rdi
>> >> >         VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
>> >> > -       vpmovmskb       %ymm1, %eax
>> >> > +       vpmovmskb %ymm1, %eax
>> >> >         /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
>> >> >            so no need to manually mod rdx.  */
>> >> >         sarxl   %edx, %eax, %eax
>> >> > @@ -470,6 +517,10 @@ L(cross_page_boundary):
>> >> >         jnz     L(cross_page_less_vec)
>> >> >         leaq    1(%rdi), %rcx
>> >> >         subq    %rdx, %rcx
>> >> > +#  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
>> >> > +       shrl    $2, %ecx
>> >> > +#  endif
>> >> >         /* Check length.  */
>> >> >         cmpq    %rsi, %rcx
>> >> >         jb      L(cross_page_continue)
>> >> > @@ -479,6 +530,7 @@ L(cross_page_boundary):
>> >> >         jz      L(cross_page_continue)
>> >> >         tzcntl  %eax, %eax
>> >> >  #  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Divide length by 4 to get wchar_t count.  */
>> >> >         shrl    $2, %eax
>> >> >  #  endif
>> >> >  # endif
>> >> > @@ -489,6 +541,10 @@ L(return_vzeroupper):
>> >> >         .p2align 4
>> >> >  L(cross_page_less_vec):
>> >> >         tzcntl  %eax, %eax
>> >> > +#  ifdef USE_AS_WCSLEN
>> >> > +       /* NB: Multiply length by 4 to get byte count.  */
>> >> > +       sall    $2, %esi
>> >> > +#  endif
>> >> >         cmpq    %rax, %rsi
>> >> >         cmovb   %esi, %eax
>> >> >  #  ifdef USE_AS_WCSLEN
>> >> > diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
>> >> > index d223ea1700..3fc6734910 100644
>> >> > --- a/sysdeps/x86_64/strlen.S
>> >> > +++ b/sysdeps/x86_64/strlen.S
>> >> > @@ -65,12 +65,24 @@ ENTRY(strlen)
>> >> >         ret
>> >> >  L(n_nonzero):
>> >> >  # ifdef AS_WCSLEN
>> >> > -       shl     $2, %RSI_LP
>> >> > +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
>> >> > +   overflow the only way this program doesn't have undefined behavior
>> >> > +   is if there is a null terminator in valid memory so strlen will
>> >> > +   suffice.  */
>> >> > +       mov     %RSI_LP, %R10_LP
>> >> > +       sar     $62, %R10_LP
>> >> > +       test    %R10_LP, %R10_LP
>> >> > +       jnz     __wcslen_sse2
>> >>
>> >> Branch to  __wcslen_sse2 is wrong for 2 reasons:
>> >>
>> >> 1.  __wcslen_sse2 is undefined with --disable-multi-arch.
>> >
>> > Won't __wcsnlen_sse2 be undefined with --disable-multi-arch as well?
>> >
>> >>
>> >> 2. You should skip ENDBR64 at function entry.
>> >>
>> >> Please create a new label and branch to it.
>> >>
>> > I am not quite sure how to do this. I am trying to use
>> > strstr-sse2-unaligned.S as a template:
>> > https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S;h=21e1a5f7cfde8ec07fcc4fc80d26984a58d651d7;hb=HEAD#l78
>> > which appears to make a direct call to the global label of __strchr_sse2
>> > without anything special in strchr-sse2.S or strstr-sse2-unaligned.S.
>>
>>
>> This is different since all files are in sysdeps/x86_64/multiarch.
>
>
> I see. So it turns out we are missing wcslen_sse4_1 which strlen.S
> can also implement (it passes all tests). Would jumping to that be
> valid?
>
> Otherwise I think the best bet is to add a target  for wcslen_sse4_1
> and define it and wcsnlen_sse4_1 in the same file so the label is visible.
> The only issue is the #defines in strlen.S need to all be protected which
> is a bit messy. If we don't want to define wcslen_sse4_1 for whatever
> reason, I already have this approach working with defining
> wcsnlen_sse4_1 in the same file as wcslen-sse2.S and entering from
> a local label. But looking at the code it seems the strlen.S file is a bit
> better optimized. Thoughts?
>

I see what is going on.  I was confused by SSE4 codes in strlen.S.
I submitted a patch to move it to multiarch/strlen-vec.S.

Yes, we should add wcslen_sse4_1.   My question is why we need
to branch from __wcsnlen_sse4_1 to __strlen_sse2 with overflow?
Can you make __wcsnlen_sse4_1 to properly handle it directly?
Noah Goldstein June 23, 2021, 4:55 a.m. UTC | #6
On Tue, Jun 22, 2021 at 11:59 PM H.J. Lu <hjl.tools@gmail.com> wrote:

>  On Tue, Jun 22, 2021 at 8:11 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >
> >
> >
> >
> > On Tue, Jun 22, 2021 at 7:29 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >>
> >> On Tue, Jun 22, 2021 at 4:16 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >> >
> >> >
> >> >
> >> > On Tue, Jun 22, 2021 at 5:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >> >>
> >> >> On Tue, Jun 22, 2021 at 11:20 AM Noah Goldstein <
> goldstein.w.n@gmail.com> wrote:
> >> >> >
> >> >> > This commit fixes the bug mentioned in the previous commit.
> >> >> >
> >> >> > The previous implementations of wmemchr in these files relied
> >> >> > on maxlen * sizeof(wchar_t) which was not guranteed by the
> standard.
> >> >> >
> >> >> > The new overflow tests added in the previous commit now
> >> >> > pass (As well as all the other tests).
> >> >> >
> >> >> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> >> >> > ---
> >> >> >  sysdeps/x86_64/multiarch/strlen-avx2.S | 130
> ++++++++++++++++++-------
> >> >> >  sysdeps/x86_64/strlen.S                |  14 ++-
> >> >> >  2 files changed, 106 insertions(+), 38 deletions(-)
> >> >> >
> >> >> > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S
> b/sysdeps/x86_64/multiarch/strlen-avx2.S
> >> >> > index bd2e6ee44a..b282a75613 100644
> >> >> > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> >> >> > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> >> >> > @@ -44,21 +44,21 @@
> >> >> >
> >> >> >  # define VEC_SIZE 32
> >> >> >  # define PAGE_SIZE 4096
> >> >> > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> >> >> >
> >> >> >         .section SECTION(.text),"ax",@progbits
> >> >> >  ENTRY (STRLEN)
> >> >> >  # ifdef USE_AS_STRNLEN
> >> >> >         /* Check zero length.  */
> >> >> > +#  ifdef __ILP32__
> >> >> > +       /* Clear upper bits.  */
> >> >> > +       and     %RSI_LP, %RSI_LP
> >> >> > +#  else
> >> >> >         test    %RSI_LP, %RSI_LP
> >> >> > +#  endif
> >> >> >         jz      L(zero)
> >> >> >         /* Store max len in R8_LP before adjusting if using
> WCSLEN.  */
> >> >> >         mov     %RSI_LP, %R8_LP
> >> >> > -#  ifdef USE_AS_WCSLEN
> >> >> > -       shl     $2, %RSI_LP
> >> >> > -#  elif defined __ILP32__
> >> >> > -       /* Clear the upper 32 bits.  */
> >> >> > -       movl    %esi, %esi
> >> >> > -#  endif
> >> >> >  # endif
> >> >> >         movl    %edi, %eax
> >> >> >         movq    %rdi, %rdx
> >> >> > @@ -72,10 +72,10 @@ ENTRY (STRLEN)
> >> >> >
> >> >> >         /* Check the first VEC_SIZE bytes.  */
> >> >> >         VPCMPEQ (%rdi), %ymm0, %ymm1
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >  # ifdef USE_AS_STRNLEN
> >> >> >         /* If length < VEC_SIZE handle special.  */
> >> >> > -       cmpq    $VEC_SIZE, %rsi
> >> >> > +       cmpq    $CHAR_PER_VEC, %rsi
> >> >> >         jbe     L(first_vec_x0)
> >> >> >  # endif
> >> >> >         /* If empty continue to aligned_more. Otherwise return bit
> >> >> > @@ -84,6 +84,7 @@ ENTRY (STRLEN)
> >> >> >         jz      L(aligned_more)
> >> >> >         tzcntl  %eax, %eax
> >> >> >  # ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrl    $2, %eax
> >> >> >  # endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -97,9 +98,14 @@ L(zero):
> >> >> >  L(first_vec_x0):
> >> >> >         /* Set bit for max len so that tzcnt will return min of
> max len
> >> >> >            and position of first match.  */
> >> >> > +#  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Multiply length by 4 to get byte count.  */
> >> >> > +       sall    $2, %esi
> >> >> > +#  endif
> >> >> >         btsq    %rsi, %rax
> >> >> >         tzcntl  %eax, %eax
> >> >> >  #  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrl    $2, %eax
> >> >> >  #  endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -113,14 +119,19 @@ L(first_vec_x1):
> >> >> >  # ifdef USE_AS_STRNLEN
> >> >> >         /* Use ecx which was computed earlier to compute correct
> value.
> >> >> >          */
> >> >> > +#  ifdef USE_AS_WCSLEN
> >> >> > +       leal    -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
> >> >> > +#  else
> >> >> >         subl    $(VEC_SIZE * 4 + 1), %ecx
> >> >> >         addl    %ecx, %eax
> >> >> > +#  endif
> >> >> >  # else
> >> >> >         subl    %edx, %edi
> >> >> >         incl    %edi
> >> >> >         addl    %edi, %eax
> >> >> >  # endif
> >> >> >  # ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrl    $2, %eax
> >> >> >  # endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -133,14 +144,19 @@ L(first_vec_x2):
> >> >> >  # ifdef USE_AS_STRNLEN
> >> >> >         /* Use ecx which was computed earlier to compute correct
> value.
> >> >> >          */
> >> >> > +#  ifdef USE_AS_WCSLEN
> >> >> > +       leal    -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
> >> >> > +#  else
> >> >> >         subl    $(VEC_SIZE * 3 + 1), %ecx
> >> >> >         addl    %ecx, %eax
> >> >> > +#  endif
> >> >> >  # else
> >> >> >         subl    %edx, %edi
> >> >> >         addl    $(VEC_SIZE + 1), %edi
> >> >> >         addl    %edi, %eax
> >> >> >  # endif
> >> >> >  # ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrl    $2, %eax
> >> >> >  # endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -153,14 +169,19 @@ L(first_vec_x3):
> >> >> >  # ifdef USE_AS_STRNLEN
> >> >> >         /* Use ecx which was computed earlier to compute correct
> value.
> >> >> >          */
> >> >> > +#  ifdef USE_AS_WCSLEN
> >> >> > +       leal    -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
> >> >> > +#  else
> >> >> >         subl    $(VEC_SIZE * 2 + 1), %ecx
> >> >> >         addl    %ecx, %eax
> >> >> > +#  endif
> >> >> >  # else
> >> >> >         subl    %edx, %edi
> >> >> >         addl    $(VEC_SIZE * 2 + 1), %edi
> >> >> >         addl    %edi, %eax
> >> >> >  # endif
> >> >> >  # ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrl    $2, %eax
> >> >> >  # endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -173,14 +194,19 @@ L(first_vec_x4):
> >> >> >  # ifdef USE_AS_STRNLEN
> >> >> >         /* Use ecx which was computed earlier to compute correct
> value.
> >> >> >          */
> >> >> > +#  ifdef USE_AS_WCSLEN
> >> >> > +       leal    -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
> >> >> > +#  else
> >> >> >         subl    $(VEC_SIZE + 1), %ecx
> >> >> >         addl    %ecx, %eax
> >> >> > +#  endif
> >> >> >  # else
> >> >> >         subl    %edx, %edi
> >> >> >         addl    $(VEC_SIZE * 3 + 1), %edi
> >> >> >         addl    %edi, %eax
> >> >> >  # endif
> >> >> >  # ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrl    $2, %eax
> >> >> >  # endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -195,10 +221,14 @@ L(cross_page_continue):
> >> >> >         /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a
> time
> >> >> >            since data is only aligned to VEC_SIZE.  */
> >> >> >  # ifdef USE_AS_STRNLEN
> >> >> > -       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
> because
> >> >> > -          it simplies the logic in last_4x_vec_or_less.  */
> >> >> > +       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
> >> >> > +          because it simplies the logic in last_4x_vec_or_less.
> */
> >> >> >         leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
> >> >> >         subq    %rdx, %rcx
> >> >> > +#  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> >> >> > +       sarl    $2, %ecx
> >> >> > +#  endif
> >> >> >  # endif
> >> >> >         /* Load first VEC regardless.  */
> >> >> >         VPCMPEQ 1(%rdi), %ymm0, %ymm1
> >> >> > @@ -207,34 +237,38 @@ L(cross_page_continue):
> >> >> >         subq    %rcx, %rsi
> >> >> >         jb      L(last_4x_vec_or_less)
> >> >> >  # endif
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >         testl   %eax, %eax
> >> >> >         jnz     L(first_vec_x1)
> >> >> >
> >> >> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >         testl   %eax, %eax
> >> >> >         jnz     L(first_vec_x2)
> >> >> >
> >> >> >         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >         testl   %eax, %eax
> >> >> >         jnz     L(first_vec_x3)
> >> >> >
> >> >> >         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >         testl   %eax, %eax
> >> >> >         jnz     L(first_vec_x4)
> >> >> >
> >> >> >         /* Align data to VEC_SIZE * 4 - 1.  */
> >> >> >  # ifdef USE_AS_STRNLEN
> >> >> >         /* Before adjusting length check if at last VEC_SIZE * 4.
> */
> >> >> > -       cmpq    $(VEC_SIZE * 4 - 1), %rsi
> >> >> > +       cmpq    $(CHAR_PER_VEC * 4 - 1), %rsi
> >> >> >         jbe     L(last_4x_vec_or_less_load)
> >> >> >         incq    %rdi
> >> >> >         movl    %edi, %ecx
> >> >> >         orq     $(VEC_SIZE * 4 - 1), %rdi
> >> >> >         andl    $(VEC_SIZE * 4 - 1), %ecx
> >> >> > +#  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> >> >> > +       sarl    $2, %ecx
> >> >> > +#  endif
> >> >> >         /* Readjust length.  */
> >> >> >         addq    %rcx, %rsi
> >> >> >  # else
> >> >> > @@ -246,13 +280,13 @@ L(cross_page_continue):
> >> >> >  L(loop_4x_vec):
> >> >> >  # ifdef USE_AS_STRNLEN
> >> >> >         /* Break if at end of length.  */
> >> >> > -       subq    $(VEC_SIZE * 4), %rsi
> >> >> > +       subq    $(CHAR_PER_VEC * 4), %rsi
> >> >> >         jb      L(last_4x_vec_or_less_cmpeq)
> >> >> >  # endif
> >> >> > -       /* Save some code size by microfusing VPMINU with the
> load. Since
> >> >> > -          the matches in ymm2/ymm4 can only be returned if there
> where no
> >> >> > -          matches in ymm1/ymm3 respectively there is no issue
> with overlap.
> >> >> > -        */
> >> >> > +       /* Save some code size by microfusing VPMINU with the load.
> >> >> > +          Since the matches in ymm2/ymm4 can only be returned if
> there
> >> >> > +          where no matches in ymm1/ymm3 respectively there is no
> issue
> >> >> > +          with overlap.  */
> >> >> >         vmovdqa 1(%rdi), %ymm1
> >> >> >         VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
> >> >> >         vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> >> >> > @@ -260,7 +294,7 @@ L(loop_4x_vec):
> >> >> >
> >> >> >         VPMINU  %ymm2, %ymm4, %ymm5
> >> >> >         VPCMPEQ %ymm5, %ymm0, %ymm5
> >> >> > -       vpmovmskb       %ymm5, %ecx
> >> >> > +       vpmovmskb %ymm5, %ecx
> >> >> >
> >> >> >         subq    $-(VEC_SIZE * 4), %rdi
> >> >> >         testl   %ecx, %ecx
> >> >> > @@ -268,27 +302,28 @@ L(loop_4x_vec):
> >> >> >
> >> >> >
> >> >> >         VPCMPEQ %ymm1, %ymm0, %ymm1
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >         subq    %rdx, %rdi
> >> >> >         testl   %eax, %eax
> >> >> >         jnz     L(last_vec_return_x0)
> >> >> >
> >> >> >         VPCMPEQ %ymm2, %ymm0, %ymm2
> >> >> > -       vpmovmskb       %ymm2, %eax
> >> >> > +       vpmovmskb %ymm2, %eax
> >> >> >         testl   %eax, %eax
> >> >> >         jnz     L(last_vec_return_x1)
> >> >> >
> >> >> >         /* Combine last 2 VEC.  */
> >> >> >         VPCMPEQ %ymm3, %ymm0, %ymm3
> >> >> > -       vpmovmskb       %ymm3, %eax
> >> >> > -       /* rcx has combined result from all 4 VEC. It will only be
> used if
> >> >> > -          the first 3 other VEC all did not contain a match.  */
> >> >> > +       vpmovmskb %ymm3, %eax
> >> >> > +       /* rcx has combined result from all 4 VEC. It will only be
> used
> >> >> > +          if the first 3 other VEC all did not contain a match.
> */
> >> >> >         salq    $32, %rcx
> >> >> >         orq     %rcx, %rax
> >> >> >         tzcntq  %rax, %rax
> >> >> >         subq    $(VEC_SIZE * 2 - 1), %rdi
> >> >> >         addq    %rdi, %rax
> >> >> >  # ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrq    $2, %rax
> >> >> >  # endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -297,15 +332,19 @@ L(loop_4x_vec):
> >> >> >  # ifdef USE_AS_STRNLEN
> >> >> >         .p2align 4
> >> >> >  L(last_4x_vec_or_less_load):
> >> >> > -       /* Depending on entry adjust rdi / prepare first VEC in
> ymm1.  */
> >> >> > +       /* Depending on entry adjust rdi / prepare first VEC in
> ymm1.
> >> >> > +        */
> >> >> >         subq    $-(VEC_SIZE * 4), %rdi
> >> >> >  L(last_4x_vec_or_less_cmpeq):
> >> >> >         VPCMPEQ 1(%rdi), %ymm0, %ymm1
> >> >> >  L(last_4x_vec_or_less):
> >> >> > -
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > -       /* If remaining length > VEC_SIZE * 2. This works if esi
> is off by
> >> >> > -          VEC_SIZE * 4.  */
> >> >> > +#  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Multiply length by 4 to get byte count.  */
> >> >> > +       sall    $2, %esi
> >> >> > +#  endif
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> > +       /* If remaining length > VEC_SIZE * 2. This works if esi
> is off
> >> >> > +          by VEC_SIZE * 4.  */
> >> >> >         testl   $(VEC_SIZE * 2), %esi
> >> >> >         jnz     L(last_4x_vec)
> >> >> >
> >> >> > @@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
> >> >> >         jb      L(max)
> >> >> >
> >> >> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >         tzcntl  %eax, %eax
> >> >> >         /* Check the end of data.  */
> >> >> >         cmpl    %eax, %esi
> >> >> > @@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
> >> >> >         addl    $(VEC_SIZE + 1), %eax
> >> >> >         addq    %rdi, %rax
> >> >> >  #  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrq    $2, %rax
> >> >> >  #  endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -340,6 +380,7 @@ L(last_vec_return_x0):
> >> >> >         subq    $(VEC_SIZE * 4 - 1), %rdi
> >> >> >         addq    %rdi, %rax
> >> >> >  # ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrq    $2, %rax
> >> >> >  # endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -350,6 +391,7 @@ L(last_vec_return_x1):
> >> >> >         subq    $(VEC_SIZE * 3 - 1), %rdi
> >> >> >         addq    %rdi, %rax
> >> >> >  # ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrq    $2, %rax
> >> >> >  # endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -366,6 +408,7 @@ L(last_vec_x1_check):
> >> >> >         incl    %eax
> >> >> >         addq    %rdi, %rax
> >> >> >  #  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrq    $2, %rax
> >> >> >  #  endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -381,14 +424,14 @@ L(last_4x_vec):
> >> >> >         jnz     L(last_vec_x1)
> >> >> >
> >> >> >         VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >         testl   %eax, %eax
> >> >> >         jnz     L(last_vec_x2)
> >> >> >
> >> >> >         /* Normalize length.  */
> >> >> >         andl    $(VEC_SIZE * 4 - 1), %esi
> >> >> >         VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >         testl   %eax, %eax
> >> >> >         jnz     L(last_vec_x3)
> >> >> >
> >> >> > @@ -396,7 +439,7 @@ L(last_4x_vec):
> >> >> >         jb      L(max)
> >> >> >
> >> >> >         VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >         tzcntl  %eax, %eax
> >> >> >         /* Check the end of data.  */
> >> >> >         cmpl    %eax, %esi
> >> >> > @@ -405,6 +448,7 @@ L(last_4x_vec):
> >> >> >         addl    $(VEC_SIZE * 3 + 1), %eax
> >> >> >         addq    %rdi, %rax
> >> >> >  #  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrq    $2, %rax
> >> >> >  #  endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -419,6 +463,7 @@ L(last_vec_x1):
> >> >> >         incl    %eax
> >> >> >         addq    %rdi, %rax
> >> >> >  #  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrq    $2, %rax
> >> >> >  #  endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -432,6 +477,7 @@ L(last_vec_x2):
> >> >> >         addl    $(VEC_SIZE + 1), %eax
> >> >> >         addq    %rdi, %rax
> >> >> >  #  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrq    $2, %rax
> >> >> >  #  endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -447,6 +493,7 @@ L(last_vec_x3):
> >> >> >         addl    $(VEC_SIZE * 2 + 1), %eax
> >> >> >         addq    %rdi, %rax
> >> >> >  #  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> >         shrq    $2, %rax
> >> >> >  #  endif
> >> >> >         VZEROUPPER_RETURN
> >> >> > @@ -455,13 +502,13 @@ L(max_end):
> >> >> >         VZEROUPPER_RETURN
> >> >> >  # endif
> >> >> >
> >> >> > -       /* Cold case for crossing page with first load.  */
> >> >> > +       /* Cold case for crossing page with first load.  */
> >> >> >         .p2align 4
> >> >> >  L(cross_page_boundary):
> >> >> >         /* Align data to VEC_SIZE - 1.  */
> >> >> >         orq     $(VEC_SIZE - 1), %rdi
> >> >> >         VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> >> >> > -       vpmovmskb       %ymm1, %eax
> >> >> > +       vpmovmskb %ymm1, %eax
> >> >> >         /* Remove the leading bytes. sarxl only uses bits [5:0] of
> COUNT
> >> >> >            so no need to manually mod rdx.  */
> >> >> >         sarxl   %edx, %eax, %eax
> >> >> > @@ -470,6 +517,10 @@ L(cross_page_boundary):
> >> >> >         jnz     L(cross_page_less_vec)
> >> >> >         leaq    1(%rdi), %rcx
> >> >> >         subq    %rdx, %rcx
> >> >> > +#  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide bytes by 4 to get wchar_t count.  */
> >> >> > +       shrl    $2, %ecx
> >> >> > +#  endif
> >> >> >         /* Check length.  */
> >> >> >         cmpq    %rsi, %rcx
> >> >> >         jb      L(cross_page_continue)
> >> >> > @@ -479,6 +530,7 @@ L(cross_page_boundary):
> >> >> >         jz      L(cross_page_continue)
> >> >> >         tzcntl  %eax, %eax
> >> >> >  #  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Divide length by 4 to get wchar_t count.  */
> >> >> >         shrl    $2, %eax
> >> >> >  #  endif
> >> >> >  # endif
> >> >> > @@ -489,6 +541,10 @@ L(return_vzeroupper):
> >> >> >         .p2align 4
> >> >> >  L(cross_page_less_vec):
> >> >> >         tzcntl  %eax, %eax
> >> >> > +#  ifdef USE_AS_WCSLEN
> >> >> > +       /* NB: Multiply length by 4 to get byte count.  */
> >> >> > +       sall    $2, %esi
> >> >> > +#  endif
> >> >> >         cmpq    %rax, %rsi
> >> >> >         cmovb   %esi, %eax
> >> >> >  #  ifdef USE_AS_WCSLEN
> >> >> > diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
> >> >> > index d223ea1700..3fc6734910 100644
> >> >> > --- a/sysdeps/x86_64/strlen.S
> >> >> > +++ b/sysdeps/x86_64/strlen.S
> >> >> > @@ -65,12 +65,24 @@ ENTRY(strlen)
> >> >> >         ret
> >> >> >  L(n_nonzero):
> >> >> >  # ifdef AS_WCSLEN
> >> >> > -       shl     $2, %RSI_LP
> >> >> > +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
> >> >> > +   overflow the only way this program doesn't have undefined
> behavior
> >> >> > +   is if there is a null terminator in valid memory so strlen will
> >> >> > +   suffice.  */
> >> >> > +       mov     %RSI_LP, %R10_LP
> >> >> > +       sar     $62, %R10_LP
> >> >> > +       test    %R10_LP, %R10_LP
> >> >> > +       jnz     __wcslen_sse2
> >> >>
> >> >> Branch to  __wcslen_sse2 is wrong for 2 reasons:
> >> >>
> >> >> 1.  __wcslen_sse2 is undefined with --disable-multi-arch.
> >> >
> >> > Won't __wcsnlen_sse2 be undefined with --disable-multi-arch as well?
> >> >
> >> >>
> >> >> 2. You should skip ENDBR64 at function entry.
> >> >>
> >> >> Please create a new label and branch to it.
> >> >>
> >> > I am not quite sure how to do this. I am trying to use
> >> > strstr-sse2-unaligned.S as a template:
> >> >
> https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S;h=21e1a5f7cfde8ec07fcc4fc80d26984a58d651d7;hb=HEAD#l78
> >> > which appears to make a direct call to the global label of
> __strchr_sse2
> >> > without anything special in strchr-sse2.S or strstr-sse2-unaligned.S.
> >>
> >>
> >> This is different since all files are in sysdeps/x86_64/multiarch.
> >
> >
> > I see. So it turns out we are missing wcslen_sse4_1 which strlen.S
> > can also implement (it passes all tests). Would jumping to that be
> > valid?
> >
> > Otherwise I think the best bet is to add a target  for wcslen_sse4_1
> > and define it and wcsnlen_sse4_1 in the same file so the label is
> visible.
> > The only issue is the #defines in strlen.S need to all be protected which
> > is a bit messy. If we don't want to define wcslen_sse4_1 for whatever
> > reason, I already have this approach working with defining
> > wcsnlen_sse4_1 in the same file as wcslen-sse2.S and entering from
> > a local label. But looking at the code it seems the strlen.S file is a
> bit
> > better optimized. Thoughts?
> >
>
> I see what is going on.  I was confused by SSE4 codes in strlen.S.
> I submitted a patch to move it to multiarch/strlen-vec.S.


> Yes, we should add wcslen_sse4_1.   My question is why we need
> to branch from __wcsnlen_sse4_1 to __strlen_sse2 with overflow?
> Can you make __wcsnlen_sse4_1 to properly handle it directly?
>
> The current approach makes it non-trivial:

# define STRNLEN_PROLOG \
mov %r11, %rsi; \
subq %rax, %rsi; \
andq $-64, %rax; \
testq $-64, %rsi; \
je L(strnlen_ret)

AFAICT forces the length to be in bytes and rewriting that
affects the entire file's logic.

I considered porting the avx2 solution but I don't think it really fits
since the results from 4x vec all fit in a 64 bit register and the vast
improvement of the branch predictor on machines that use avx2.

I also think in the overflow case it is likely faster going through
wcslen as given that all the length bookkeeping / branches
can be dropped although it definitely does pessimize the common
no-overflow case.


> --
> H.J.
>
diff mbox series

Patch

diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index bd2e6ee44a..b282a75613 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -44,21 +44,21 @@ 
 
 # define VEC_SIZE 32
 # define PAGE_SIZE 4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
 	.section SECTION(.text),"ax",@progbits
 ENTRY (STRLEN)
 # ifdef USE_AS_STRNLEN
 	/* Check zero length.  */
+#  ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RSI_LP, %RSI_LP
+#  else
 	test	%RSI_LP, %RSI_LP
+#  endif
 	jz	L(zero)
 	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
 	mov	%RSI_LP, %R8_LP
-#  ifdef USE_AS_WCSLEN
-	shl	$2, %RSI_LP
-#  elif defined __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%esi, %esi
-#  endif
 # endif
 	movl	%edi, %eax
 	movq	%rdi, %rdx
@@ -72,10 +72,10 @@  ENTRY (STRLEN)
 
 	/* Check the first VEC_SIZE bytes.  */
 	VPCMPEQ	(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 # ifdef USE_AS_STRNLEN
 	/* If length < VEC_SIZE handle special.  */
-	cmpq	$VEC_SIZE, %rsi
+	cmpq	$CHAR_PER_VEC, %rsi
 	jbe	L(first_vec_x0)
 # endif
 	/* If empty continue to aligned_more. Otherwise return bit
@@ -84,6 +84,7 @@  ENTRY (STRLEN)
 	jz	L(aligned_more)
 	tzcntl	%eax, %eax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -97,9 +98,14 @@  L(zero):
 L(first_vec_x0):
 	/* Set bit for max len so that tzcnt will return min of max len
 	   and position of first match.  */
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+#  endif
 	btsq	%rsi, %rax
 	tzcntl	%eax, %eax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 #  endif
 	VZEROUPPER_RETURN
@@ -113,14 +119,19 @@  L(first_vec_x1):
 # ifdef USE_AS_STRNLEN
 	/* Use ecx which was computed earlier to compute correct value.
 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
+#  else
 	subl	$(VEC_SIZE * 4 + 1), %ecx
 	addl	%ecx, %eax
+#  endif
 # else
 	subl	%edx, %edi
 	incl	%edi
 	addl	%edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -133,14 +144,19 @@  L(first_vec_x2):
 # ifdef USE_AS_STRNLEN
 	/* Use ecx which was computed earlier to compute correct value.
 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
+#  else
 	subl	$(VEC_SIZE * 3 + 1), %ecx
 	addl	%ecx, %eax
+#  endif
 # else
 	subl	%edx, %edi
 	addl	$(VEC_SIZE + 1), %edi
 	addl	%edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -153,14 +169,19 @@  L(first_vec_x3):
 # ifdef USE_AS_STRNLEN
 	/* Use ecx which was computed earlier to compute correct value.
 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
+#  else
 	subl	$(VEC_SIZE * 2 + 1), %ecx
 	addl	%ecx, %eax
+#  endif
 # else
 	subl	%edx, %edi
 	addl	$(VEC_SIZE * 2 + 1), %edi
 	addl	%edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -173,14 +194,19 @@  L(first_vec_x4):
 # ifdef USE_AS_STRNLEN
 	/* Use ecx which was computed earlier to compute correct value.
 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
+#  else
 	subl	$(VEC_SIZE + 1), %ecx
 	addl	%ecx, %eax
+#  endif
 # else
 	subl	%edx, %edi
 	addl	$(VEC_SIZE * 3 + 1), %edi
 	addl	%edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -195,10 +221,14 @@  L(cross_page_continue):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
 # ifdef USE_AS_STRNLEN
-	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
-	   it simplies the logic in last_4x_vec_or_less.  */
+	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
+	   because it simplies the logic in last_4x_vec_or_less.  */
 	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
 	subq	%rdx, %rcx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
 # endif
 	/* Load first VEC regardless.  */
 	VPCMPEQ	1(%rdi), %ymm0, %ymm1
@@ -207,34 +237,38 @@  L(cross_page_continue):
 	subq	%rcx, %rsi
 	jb	L(last_4x_vec_or_less)
 # endif
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
 	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
 	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x4)
 
 	/* Align data to VEC_SIZE * 4 - 1.  */
 # ifdef USE_AS_STRNLEN
 	/* Before adjusting length check if at last VEC_SIZE * 4.  */
-	cmpq	$(VEC_SIZE * 4 - 1), %rsi
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
 	jbe	L(last_4x_vec_or_less_load)
 	incq	%rdi
 	movl	%edi, %ecx
 	orq	$(VEC_SIZE * 4 - 1), %rdi
 	andl	$(VEC_SIZE * 4 - 1), %ecx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
 	/* Readjust length.  */
 	addq	%rcx, %rsi
 # else
@@ -246,13 +280,13 @@  L(cross_page_continue):
 L(loop_4x_vec):
 # ifdef USE_AS_STRNLEN
 	/* Break if at end of length.  */
-	subq	$(VEC_SIZE * 4), %rsi
+	subq	$(CHAR_PER_VEC * 4), %rsi
 	jb	L(last_4x_vec_or_less_cmpeq)
 # endif
-	/* Save some code size by microfusing VPMINU with the load. Since
-	   the matches in ymm2/ymm4 can only be returned if there where no
-	   matches in ymm1/ymm3 respectively there is no issue with overlap.
-	 */
+	/* Save some code size by microfusing VPMINU with the load.
+	   Since the matches in ymm2/ymm4 can only be returned if there
+	   where no matches in ymm1/ymm3 respectively there is no issue
+	   with overlap.  */
 	vmovdqa	1(%rdi), %ymm1
 	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
 	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
@@ -260,7 +294,7 @@  L(loop_4x_vec):
 
 	VPMINU	%ymm2, %ymm4, %ymm5
 	VPCMPEQ	%ymm5, %ymm0, %ymm5
-	vpmovmskb	%ymm5, %ecx
+	vpmovmskb %ymm5, %ecx
 
 	subq	$-(VEC_SIZE * 4), %rdi
 	testl	%ecx, %ecx
@@ -268,27 +302,28 @@  L(loop_4x_vec):
 
 
 	VPCMPEQ	%ymm1, %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	subq	%rdx, %rdi
 	testl	%eax, %eax
 	jnz	L(last_vec_return_x0)
 
 	VPCMPEQ	%ymm2, %ymm0, %ymm2
-	vpmovmskb	%ymm2, %eax
+	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_return_x1)
 
 	/* Combine last 2 VEC.  */
 	VPCMPEQ	%ymm3, %ymm0, %ymm3
-	vpmovmskb	%ymm3, %eax
-	/* rcx has combined result from all 4 VEC. It will only be used if
-	   the first 3 other VEC all did not contain a match.  */
+	vpmovmskb %ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used
+	   if the first 3 other VEC all did not contain a match.  */
 	salq	$32, %rcx
 	orq	%rcx, %rax
 	tzcntq	%rax, %rax
 	subq	$(VEC_SIZE * 2 - 1), %rdi
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -297,15 +332,19 @@  L(loop_4x_vec):
 # ifdef USE_AS_STRNLEN
 	.p2align 4
 L(last_4x_vec_or_less_load):
-	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
+	/* Depending on entry adjust rdi / prepare first VEC in ymm1.
+	 */
 	subq	$-(VEC_SIZE * 4), %rdi
 L(last_4x_vec_or_less_cmpeq):
 	VPCMPEQ	1(%rdi), %ymm0, %ymm1
 L(last_4x_vec_or_less):
-
-	vpmovmskb	%ymm1, %eax
-	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
-	   VEC_SIZE * 4.  */
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+#  endif
+	vpmovmskb %ymm1, %eax
+	/* If remaining length > VEC_SIZE * 2. This works if esi is off
+	   by VEC_SIZE * 4.  */
 	testl	$(VEC_SIZE * 2), %esi
 	jnz	L(last_4x_vec)
 
@@ -320,7 +359,7 @@  L(last_4x_vec_or_less):
 	jb	L(max)
 
 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
 	cmpl	%eax, %esi
@@ -329,6 +368,7 @@  L(last_4x_vec_or_less):
 	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -340,6 +380,7 @@  L(last_vec_return_x0):
 	subq	$(VEC_SIZE * 4 - 1), %rdi
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -350,6 +391,7 @@  L(last_vec_return_x1):
 	subq	$(VEC_SIZE * 3 - 1), %rdi
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -366,6 +408,7 @@  L(last_vec_x1_check):
 	incl	%eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -381,14 +424,14 @@  L(last_4x_vec):
 	jnz	L(last_vec_x1)
 
 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x2)
 
 	/* Normalize length.  */
 	andl	$(VEC_SIZE * 4 - 1), %esi
 	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x3)
 
@@ -396,7 +439,7 @@  L(last_4x_vec):
 	jb	L(max)
 
 	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
 	cmpl	%eax, %esi
@@ -405,6 +448,7 @@  L(last_4x_vec):
 	addl	$(VEC_SIZE * 3 + 1), %eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -419,6 +463,7 @@  L(last_vec_x1):
 	incl	%eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -432,6 +477,7 @@  L(last_vec_x2):
 	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -447,6 +493,7 @@  L(last_vec_x3):
 	addl	$(VEC_SIZE * 2 + 1), %eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -455,13 +502,13 @@  L(max_end):
 	VZEROUPPER_RETURN
 # endif
 
-	/* Cold case for crossing page with first load.	 */
+	/* Cold case for crossing page with first load.  */
 	.p2align 4
 L(cross_page_boundary):
 	/* Align data to VEC_SIZE - 1.  */
 	orq	$(VEC_SIZE - 1), %rdi
 	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
 	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
 	   so no need to manually mod rdx.  */
 	sarxl	%edx, %eax, %eax
@@ -470,6 +517,10 @@  L(cross_page_boundary):
 	jnz	L(cross_page_less_vec)
 	leaq	1(%rdi), %rcx
 	subq	%rdx, %rcx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %ecx
+#  endif
 	/* Check length.  */
 	cmpq	%rsi, %rcx
 	jb	L(cross_page_continue)
@@ -479,6 +530,7 @@  L(cross_page_boundary):
 	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide length by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 #  endif
 # endif
@@ -489,6 +541,10 @@  L(return_vzeroupper):
 	.p2align 4
 L(cross_page_less_vec):
 	tzcntl	%eax, %eax
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+#  endif
 	cmpq	%rax, %rsi
 	cmovb	%esi, %eax
 #  ifdef USE_AS_WCSLEN
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index d223ea1700..3fc6734910 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -65,12 +65,24 @@  ENTRY(strlen)
 	ret
 L(n_nonzero):
 # ifdef AS_WCSLEN
-	shl	$2, %RSI_LP
+/* Check for overflow from maxlen * sizeof(wchar_t). If it would
+   overflow the only way this program doesn't have undefined behavior 
+   is if there is a null terminator in valid memory so strlen will 
+   suffice.  */
+	mov	%RSI_LP, %R10_LP
+	sar	$62, %R10_LP
+	test	%R10_LP, %R10_LP
+	jnz	__wcslen_sse2
+	sal	$2, %RSI_LP
 # endif
 
 /* Initialize long lived registers.  */
 
 	add	%RDI_LP, %RSI_LP
+# ifdef AS_WCSLEN
+/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
+	jbe	__wcslen_sse2
+# endif
 	mov	%RSI_LP, %R10_LP
 	and	$-64, %R10_LP
 	mov	%RSI_LP, %R11_LP