[v6,4/8] x86: Optimize memrchr-sse2.S

Message ID 20220607041134.2369903-4-goldstein.w.n@gmail.com
State Committed
Commit 731feee3869550e93177e604604c1765d81de571
Series [v6,1/8] x86: Create header for VEC classes in x86 strings library

Checks

Context                Check    Description
dj/TryBot-apply_patch  success  Patch applied to master at the time it was sent

Commit Message

Noah Goldstein June 7, 2022, 4:11 a.m. UTC
  The new code:
    1. prioritizes smaller lengths more.
    2. optimizes target placement more carefully.
    3. reuses logic more.
    4. fixes up various inefficiencies in the logic.

The total code size saving is: 394 bytes
Geometric Mean of all benchmarks New / Old: 0.874
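
For reference, 0.874 is the geometric mean of the per-benchmark New/Old
time ratios, i.e. roughly a 12.6% average improvement. A minimal C sketch
of that computation (illustrative only; geomean_ratio, new_times and
old_times are hypothetical names, not part of the benchtests harness):

#include <math.h>
#include <stddef.h>

/* Geometric mean of per-benchmark New/Old time ratios.  A result of 0.874
   means the new code is ~12.6% faster on (geometric) average.  */
static double
geomean_ratio (const double *new_times, const double *old_times, size_t n)
{
  double log_sum = 0.0;
  for (size_t i = 0; i < n; i++)
    log_sum += log (new_times[i] / old_times[i]);
  return exp (log_sum / n);
}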

Regressions:
    1. The page cross case is now colder, especially re-entry from the
       page cross case when a match is not found in the first VEC
       (roughly 50%). My general opinion is that this is acceptable
       given the "coldness" of this case (less than 4%) and the general
       performance improvement in the other, far more common cases.

    2. There are some 5-15% regressions for medium/large user-arg
       lengths that have a match in the first VEC. This is because the
       logic was rewritten to optimize finds in the first VEC when the
       user-arg length is shorter (where we see roughly 20-50%
       performance improvements). This is not always a regression. My
       intuition is that some frontend quirk partially explains the
       data, although I haven't been able to find the root cause.

Full xcheck passes on x86_64.
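
As a rough guide to the restructuring, the byte-at-a-time C sketch below
mirrors the control flow the new assembly uses (end pointer, hot
len <= VEC_SIZE case first, then walk backwards one VEC at a time). It is
illustrative only and is not the implementation; memrchr_sketch is a
made-up name, and the SSE2 compares, masks, alignment and 4x unrolling are
deliberately omitted:

#include <stddef.h>

#define VEC_SIZE 16

static const void *
memrchr_sketch (const void *s, int c, size_t len)
{
  const unsigned char *base = s;
  const unsigned char *end = base + len;  /* End pointer (rcx in the asm).  */
  unsigned char ch = (unsigned char) c;

  /* Hot case: the match (if any) is within the last VEC_SIZE bytes, which
     also covers len <= VEC_SIZE.  */
  size_t first = len < VEC_SIZE ? len : VEC_SIZE;
  for (size_t i = 0; i < first; i++)
    if (end[-1 - i] == ch)
      return end - 1 - i;
  if (len <= VEC_SIZE)
    return NULL;

  /* Remaining bytes, walked backwards one VEC at a time (the assembly
     aligns the pointer and unrolls this 4x in L(loop_4x_vec)).  */
  const unsigned char *p = end - VEC_SIZE;
  while (p > base)
    {
      size_t chunk
        = (size_t) (p - base) < VEC_SIZE ? (size_t) (p - base) : VEC_SIZE;
      for (size_t i = 0; i < chunk; i++)
        if (p[-1 - i] == ch)
          return p - 1 - i;
      p -= chunk;
    }
  return NULL;
}
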
---
 sysdeps/x86_64/memrchr.S | 613 +++++++++++++++++++--------------------
 1 file changed, 292 insertions(+), 321 deletions(-)
  

Comments

H.J. Lu June 7, 2022, 6:04 p.m. UTC | #1
On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> [... full patch quoted; see the commit message above and the patch below ...]

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
  
Sunil Pandey July 14, 2022, 2:19 a.m. UTC | #2
On Tue, Jun 7, 2022 at 11:07 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > [... full patch quoted; see the patch below ...]
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
  

Patch

diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index d1a9f47911..b0dffd2ae2 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -18,362 +18,333 @@ 
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#define VEC_SIZE			16
+#define PAGE_SIZE			4096
 
 	.text
-ENTRY (__memrchr)
-	movd	%esi, %xmm1
-
-	sub	$16, %RDX_LP
-	jbe	L(length_less16)
-
-	punpcklbw	%xmm1, %xmm1
-	punpcklbw	%xmm1, %xmm1
-
-	add	%RDX_LP, %RDI_LP
-	pshufd	$0, %xmm1, %xmm1
-
-	movdqu	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-
-/* Check if there is a match.  */
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	sub	$64, %rdi
-	mov	%edi, %ecx
-	and	$15, %ecx
-	jz	L(loop_prolog)
-
-	add	$16, %rdi
-	add	$16, %rdx
-	and	$-16, %rdi
-	sub	%rcx, %rdx
-
-	.p2align 4
-L(loop_prolog):
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	(%rdi), %xmm4
-	pcmpeqb	%xmm1, %xmm4
-	pmovmskb	%xmm4, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	sub	$64, %rdi
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	mov	%edi, %ecx
-	and	$63, %ecx
-	jz	L(align64_loop)
-
-	add	$64, %rdi
-	add	$64, %rdx
-	and	$-64, %rdi
-	sub	%rcx, %rdx
-
-	.p2align 4
-L(align64_loop):
-	sub	$64, %rdi
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	(%rdi), %xmm0
-	movdqa	16(%rdi), %xmm2
-	movdqa	32(%rdi), %xmm3
-	movdqa	48(%rdi), %xmm4
-
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm1, %xmm2
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm1, %xmm4
-
-	pmaxub	%xmm3, %xmm0
-	pmaxub	%xmm4, %xmm2
-	pmaxub	%xmm0, %xmm2
-	pmovmskb	%xmm2, %eax
-
-	test	%eax, %eax
-	jz	L(align64_loop)
-
-	pmovmskb	%xmm4, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm2
-
-	pcmpeqb	%xmm1, %xmm2
-	pcmpeqb	(%rdi), %xmm1
-
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	pmovmskb	%xmm1, %eax
-	bsr	%eax, %eax
-
-	add	%rdi, %rax
+ENTRY_P2ALIGN(__memrchr, 6)
+#ifdef __ILP32__
+	/* Clear upper bits.  */
+	mov	%RDX_LP, %RDX_LP
+#endif
+	movd	%esi, %xmm0
+
+	/* Get end pointer.  */
+	leaq	(%rdx, %rdi), %rcx
+
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
+
+	/* Check if we can load 1x VEC without crossing a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	jz	L(page_cross)
+
+	/* NB: This load happens regardless of whether rdx (len) is zero. Since
+	   it doesn't cross a page and the standard guarantees any pointer has
+	   at least one valid byte, this load must be safe. For the entire
+	   history of the x86 memrchr implementation this has been possible, so
+	   no code "should" be relying on a zero-length check before this load.
+	   The zero-length check is moved to the page cross case because it is
+	   1) pretty cold and 2) including it pushes the hot case len <= VEC_SIZE
+	   into 2 cache lines.  */
+	movups	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+L(ret_vec_x0_test):
+	/* Zero-flag set if eax (src) is zero. Destination unchanged if src is
+	   zero.  */
+	bsrl	%eax, %eax
+	jz	L(ret_0)
+	/* Check if the CHAR match is in bounds. Need to truly zero `eax` here
+	   if out of bounds.  */
+	addl	%edx, %eax
+	jl	L(zero_0)
+	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
+	   ptr.  */
+	addq	%rdi, %rax
+L(ret_0):
 	ret
 
-	.p2align 4
-L(exit_loop):
-	add	$64, %edx
-	cmp	$32, %edx
-	jbe	L(exit_loop_32)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16_1)
-	cmp	$48, %edx
-	jbe	L(return_null)
-
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-	test	%eax, %eax
-	jnz	L(matches0_1)
-	xor	%eax, %eax
+	.p2align 4,, 5
+L(ret_vec_x0):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE)(%rcx, %rax), %rax
 	ret
 
-	.p2align 4
-L(exit_loop_32):
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48_1)
-	cmp	$16, %edx
-	jbe	L(return_null)
-
-	pcmpeqb	32(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-	test	%eax, %eax
-	jnz	L(matches32_1)
-	xor	%eax, %eax
+	.p2align 4,, 2
+L(zero_0):
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(matches0):
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	ret
-
-	.p2align 4
-L(matches16):
-	bsr	%eax, %eax
-	lea	16(%rax, %rdi), %rax
-	ret
 
-	.p2align 4
-L(matches32):
-	bsr	%eax, %eax
-	lea	32(%rax, %rdi), %rax
+	.p2align 4,, 8
+L(more_1x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	/* Align rcx (pointer to string).  */
+	decq	%rcx
+	andq	$-VEC_SIZE, %rcx
+
+	movq	%rcx, %rdx
+	/* NB: We could consistently save 1 byte in this pattern with `movaps
+	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
+	   that it adds more frontend uops (even if the moves can be eliminated)
+	   and, some percentage of the time, actual backend uops.  */
+	movaps	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	subq	%rdi, %rdx
+	pmovmskb %xmm1, %eax
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+L(last_2x_vec):
+	subl	$VEC_SIZE, %edx
+	jbe	L(ret_vec_x0_test)
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$VEC_SIZE, %edx
+	bsrl	%eax, %eax
+	jz	L(ret_1)
+	addl	%edx, %eax
+	jl	L(zero_0)
+	addq	%rdi, %rax
+L(ret_1):
 	ret
 
-	.p2align 4
-L(matches48):
-	bsr	%eax, %eax
-	lea	48(%rax, %rdi), %rax
+	/* Don't align. Otherwise losing the 2-byte encoding of the jump to
+	   L(page_cross) causes the hot path (length <= VEC_SIZE) to span
+	   multiple cache lines.  Naturally aligned % 16 to 8 bytes.  */
+L(page_cross):
+	/* Zero length check.  */
+	testq	%rdx, %rdx
+	jz	L(zero_0)
+
+	leaq	-1(%rcx), %r8
+	andq	$-(VEC_SIZE), %r8
+
+	movaps	(%r8), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	negl	%ecx
+	/* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
+	   explicitly.  */
+	andl	$(VEC_SIZE - 1), %ecx
+	shl	%cl, %esi
+	movzwl	%si, %eax
+	leaq	(%rdi, %rdx), %rcx
+	cmpq	%rdi, %r8
+	ja	L(more_1x_vec)
+	subl	$VEC_SIZE, %edx
+	bsrl	%eax, %eax
+	jz	L(ret_2)
+	addl	%edx, %eax
+	jl	L(zero_1)
+	addq	%rdi, %rax
+L(ret_2):
 	ret
 
-	.p2align 4
-L(matches0_1):
-	bsr	%eax, %eax
-	sub	$64, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	add	%rdi, %rax
+	/* Fits in the aligning bytes.  */
+L(zero_1):
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(matches16_1):
-	bsr	%eax, %eax
-	sub	$48, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	16(%rdi, %rax), %rax
+	.p2align 4,, 5
+L(ret_vec_x1):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
 	ret
 
-	.p2align 4
-L(matches32_1):
-	bsr	%eax, %eax
-	sub	$32, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	32(%rdi, %rax), %rax
-	ret
+	.p2align 4,, 8
+L(more_2x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
 
-	.p2align 4
-L(matches48_1):
-	bsr	%eax, %eax
-	sub	$16, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	48(%rdi, %rax), %rax
-	ret
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(return_null):
-	xor	%eax, %eax
-	ret
 
-	.p2align 4
-L(length_less16_offset0):
-	test	%edx, %edx
-	jz	L(return_null)
+	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
 
-	mov	%dl, %cl
-	pcmpeqb	(%rdi), %xmm1
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
 
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
+	addl	$(VEC_SIZE), %edx
+	jle	L(ret_vec_x2_test)
 
-	pmovmskb	%xmm1, %eax
+L(last_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x2)
 
-	and	%edx, %eax
-	test	%eax, %eax
-	jz	L(return_null)
+	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
 
-	bsr	%eax, %eax
-	add	%rdi, %rax
+	subl	$(VEC_SIZE), %edx
+	bsrl	%eax, %eax
+	jz	L(ret_3)
+	addl	%edx, %eax
+	jl	L(zero_2)
+	addq	%rdi, %rax
+L(ret_3):
 	ret
 
-	.p2align 4
-L(length_less16):
-	punpcklbw	%xmm1, %xmm1
-	punpcklbw	%xmm1, %xmm1
-
-	add	$16, %edx
-
-	pshufd	$0, %xmm1, %xmm1
-
-	mov	%edi, %ecx
-	and	$15, %ecx
-	jz	L(length_less16_offset0)
-
-	mov	%cl, %dh
-	mov	%ecx, %esi
-	add	%dl, %dh
-	and	$-16, %rdi
-
-	sub	$16, %dh
-	ja	L(length_less16_part2)
-
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-
-	sar	%cl, %eax
-	mov	%dl, %cl
-
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
-
-	and	%edx, %eax
-	test	%eax, %eax
-	jz	L(return_null)
-
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	add	%rsi, %rax
+	.p2align 4,, 6
+L(ret_vec_x2_test):
+	bsrl	%eax, %eax
+	jz	L(zero_2)
+	addl	%edx, %eax
+	jl	L(zero_2)
+	addq	%rdi, %rax
 	ret
 
-	.p2align 4
-L(length_less16_part2):
-	movdqa	16(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-
-	mov	%dh, %cl
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
-
-	and	%edx, %eax
+L(zero_2):
+	xorl	%eax, %eax
+	ret
 
-	test	%eax, %eax
-	jnz	L(length_less16_part2_return)
 
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
+	.p2align 4,, 5
+L(ret_vec_x2):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret
 
-	mov	%esi, %ecx
-	sar	%cl, %eax
-	test	%eax, %eax
-	jz	L(return_null)
+	.p2align 4,, 5
+L(ret_vec_x3):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	ret
 
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	add	%rsi, %rax
+	.p2align 4,, 8
+L(more_4x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x2)
+
+	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x3)
+
+	addq	$-(VEC_SIZE * 4), %rcx
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
+	   keeping the code from spilling to the next cache line.  */
+	addq	$(VEC_SIZE * 4 - 1), %rcx
+	andq	$-(VEC_SIZE * 4), %rcx
+	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+	andq	$-(VEC_SIZE * 4), %rdx
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	movaps	(VEC_SIZE * -1)(%rcx), %xmm1
+	movaps	(VEC_SIZE * -2)(%rcx), %xmm2
+	movaps	(VEC_SIZE * -3)(%rcx), %xmm3
+	movaps	(VEC_SIZE * -4)(%rcx), %xmm4
+	pcmpeqb	%xmm0, %xmm1
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm0, %xmm3
+	pcmpeqb	%xmm0, %xmm4
+
+	por	%xmm1, %xmm2
+	por	%xmm3, %xmm4
+	por	%xmm2, %xmm4
+
+	pmovmskb %xmm4, %esi
+	testl	%esi, %esi
+	jnz	L(loop_end)
+
+	addq	$-(VEC_SIZE * 4), %rcx
+	cmpq	%rdx, %rcx
+	jne	L(loop_4x_vec)
+
+	subl	%edi, %edx
+
+	/* Ends up being 1-byte nop.  */
+	.p2align 4,, 2
+L(last_4x_vec):
+	movaps	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_end)
+
+	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
+	bsrl	%eax, %eax
+	jz	L(ret_4)
+	addl	%edx, %eax
+	jl	L(zero_3)
+	addq	%rdi, %rax
+L(ret_4):
 	ret
 
-	.p2align 4
-L(length_less16_part2_return):
-	bsr	%eax, %eax
-	lea	16(%rax, %rdi), %rax
+	/* Ends up being 1-byte nop.  */
+	.p2align 4,, 3
+L(loop_end):
+	pmovmskb %xmm1, %eax
+	sall	$16, %eax
+	jnz	L(ret_vec_end)
+
+	pmovmskb %xmm2, %eax
+	testl	%eax, %eax
+	jnz	L(ret_vec_end)
+
+	pmovmskb %xmm3, %eax
+	/* Combine the last 2 VEC matches. If eax (VEC3) is zero (no CHAR in
+	   VEC3) then it won't affect the result in esi (VEC4). If eax is
+	   non-zero then the CHAR is in VEC3 and bsrl will use that position.  */
+	sall	$16, %eax
+	orl	%esi, %eax
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
 	ret
 
-END (__memrchr)
+L(ret_vec_end):
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * -2)(%rax, %rcx), %rax
+	ret
+	/* Used in L(last_4x_vec). In the same cache line. This just uses the
+	   spare aligning bytes.  */
+L(zero_3):
+	xorl	%eax, %eax
+	ret
+	/* 2-bytes from next cache line.  */
+END(__memrchr)
 weak_alias (__memrchr, memrchr)
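
A note on L(loop_end) above: the following hypothetical C sketch (SSE2
intrinsics, not part of the patch; loop_end_sketch, chunk_end, match3 and
match4 are made-up names) shows the mask-combining idea, assuming chunk_end
plays the role of rcx after the 4x loop and that VEC1/VEC2 were already
found empty:

#include <emmintrin.h>
#include <stddef.h>

/* Put the VEC3 match mask in bits 16-31 and the VEC4 match mask in bits
   0-15 so a single bsr picks the match that is last in memory.  (The asm
   ORs with the already-combined mask held in esi rather than the plain
   VEC4 mask; once VEC1/VEC2 are known empty the result is the same.)  */
static const char *
loop_end_sketch (const char *chunk_end, __m128i match3, __m128i match4)
{
  /* match3 covers chunk_end[-48..-33], match4 covers chunk_end[-64..-49],
     mirroring the (VEC_SIZE * -3) and (VEC_SIZE * -4) offsets from rcx.  */
  unsigned int vec3 = (unsigned int) _mm_movemask_epi8 (match3);
  unsigned int vec4 = (unsigned int) _mm_movemask_epi8 (match4);
  unsigned int combined = (vec3 << 16) | vec4;
  if (combined == 0)
    return NULL;
  int highest = 31 - __builtin_clz (combined);  /* bsr equivalent.  */
  return chunk_end - 64 + highest;
}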