[v1,2/3] x86: Optimize memchr-avx2.S
Commit Message
No bug. This commit optimizes memchr-avx2.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
and saving a few instructions in the loop return. test-memchr,
test-rawmemchr, and test-wmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/memchr-avx2.S | 446 +++++++++++++++----------
1 file changed, 262 insertions(+), 184 deletions(-)
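
The cmovcc change mentioned above shows up in the new L(first_vec_x0) path,
where the return address is computed unconditionally and a conditional move
replaces the old branch to a zero return. As a rough illustration only (the
function name and signature below are invented; this is not the glibc code),
the same selection in C looks like:

#include <stddef.h>

/* Hedged sketch of the L(first_vec_x0) idea: compute the candidate match
   address, then select NULL instead of branching when the first match lies
   at or past the requested length.  */
static const char *
clamp_match (const char *start, unsigned int match_offset, size_t len)
{
  /* The asm forms start + match_offset with lea and then uses cmovle to
     overwrite it with 0; a plain conditional expresses the same select.  */
  return match_offset < len ? start + match_offset : NULL;
}

Because both outcomes are available before the select, there is no branch for
the predictor to miss on short, length-limited calls.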
Comments
On Mon, May 03, 2021 at 04:44:36AM -0400, Noah Goldstein wrote:
> No bug. This commit optimizes memchr-avx2.S. The optimizations include
> replacing some branches with cmovcc, avoiding some branches entirely
> in the less_4x_vec case, making the page cross logic less strict,
> and saving a few instructions in the loop return. test-memchr,
> test-rawmemchr, and test-wmemchr are all passing.
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
> sysdeps/x86_64/multiarch/memchr-avx2.S | 446 +++++++++++++++----------
> 1 file changed, 262 insertions(+), 184 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
> index 1fcb1c350f..8368fcd1e1 100644
> --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
> @@ -26,8 +26,22 @@
>
> # ifdef USE_AS_WMEMCHR
> # define VPCMPEQ vpcmpeqd
> +# define VPBROADCAST vpbroadcastd
> +# define CHAR_SIZE 4
> # else
> # define VPCMPEQ vpcmpeqb
> +# define VPBROADCAST vpbroadcastb
> +# define CHAR_SIZE 1
> +# endif
> +
> +# ifdef USE_AS_RAWMEMCHR
> +# define ERAW_PTR_REG ecx
> +# define RRAW_PTR_REG rcx
> +# define ALGN_PTR_REG rdi
> +# else
> +# define ERAW_PTR_REG edi
> +# define RRAW_PTR_REG rdi
> +# define ALGN_PTR_REG rcx
> # endif
>
> # ifndef VZEROUPPER
> @@ -39,303 +53,367 @@
> # endif
>
> # define VEC_SIZE 32
> +# define PAGE_SIZE 4096
> +
Remove the extra line here.
>
> .section SECTION(.text),"ax",@progbits
> -ENTRY (MEMCHR)
> +ENTRY(MEMCHR)
No need for this change.
> # ifndef USE_AS_RAWMEMCHR
> /* Check for zero length. */
> test %RDX_LP, %RDX_LP
> jz L(null)
> # endif
> - movl %edi, %ecx
> - /* Broadcast CHAR to YMM0. */
> - vmovd %esi, %xmm0
> # ifdef USE_AS_WMEMCHR
> shl $2, %RDX_LP
> - vpbroadcastd %xmm0, %ymm0
> # else
> # ifdef __ILP32__
> /* Clear the upper 32 bits. */
> movl %edx, %edx
> # endif
> - vpbroadcastb %xmm0, %ymm0
> # endif
> - /* Check if we may cross page boundary with one vector load. */
> - andl $(2 * VEC_SIZE - 1), %ecx
> - cmpl $VEC_SIZE, %ecx
> - ja L(cros_page_boundary)
> + /* Broadcast CHAR to YMMMATCH. */
> + vmovd %esi, %xmm0
> + VPBROADCAST %xmm0, %ymm0
> + /* Check if we may cross page boundary with one
> + vector load. */
> + movl %edi, %eax
> + andl $(PAGE_SIZE - 1), %eax
> + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> + ja L(cross_page_boundary)
>
> /* Check the first VEC_SIZE bytes. */
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> + VPCMPEQ (%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> -
> # ifndef USE_AS_RAWMEMCHR
> - jnz L(first_vec_x0_check)
> - /* Adjust length and check the end of data. */
> - subq $VEC_SIZE, %rdx
> - jbe L(zero)
> -# else
> - jnz L(first_vec_x0)
> + /* If length < CHAR_PER_VEC handle special. */
> + cmpq $VEC_SIZE, %rdx
> + jbe L(first_vec_x0)
> # endif
> -
> - /* Align data for aligned loads in the loop. */
> - addq $VEC_SIZE, %rdi
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> + testl %eax, %eax
> + jz L(aligned_more)
> + tzcntl %eax, %eax
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
>
> # ifndef USE_AS_RAWMEMCHR
> - /* Adjust length. */
> - addq %rcx, %rdx
> -
> - subq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec_or_less)
> + .p2align 5
> +L(first_vec_x0):
> + /* Check if first match was before length. */
> + tzcntl %eax, %eax
> + xorl %ecx, %ecx
> + cmpl %eax, %edx
> + leaq (%rdi, %rax), %rax
> + cmovle %rcx, %rax
> + VZEROUPPER_RETURN
> +L(null):
> + xorl %eax, %eax
> + ret
> # endif
> - jmp L(more_4x_vec)
> -
> .p2align 4
> -L(cros_page_boundary):
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> +L(cross_page_boundary):
> + /* Save pointer before aligning as its original
> + value is necessary for computer return address if byte is
> + found or adjusting length if it is not and this is
Fit comments to 72 columns.
> + memchr. */
> + movq %rdi, %rcx
> + /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is
> + rcx for memchr and rdi for rawmemchr. */
> + orq $(VEC_SIZE - 1), %ALGN_PTR_REG
> + VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> +# ifndef USE_AS_RAWMEMCHR
> + /* Calculate length until end of page (length
> + checked for a match). */
> + leaq 1(%ALGN_PTR_REG), %rsi
> + subq %RRAW_PTR_REG, %rsi
> +# endif
> /* Remove the leading bytes. */
> - sarl %cl, %eax
> - testl %eax, %eax
> - jz L(aligned_more)
> - tzcntl %eax, %eax
> + sarxl %ERAW_PTR_REG, %eax, %eax
> # ifndef USE_AS_RAWMEMCHR
> /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> + cmpq %rsi, %rdx
> + jbe L(first_vec_x0)
> # endif
> - addq %rdi, %rax
> - addq %rcx, %rax
> + testl %eax, %eax
> + jz L(cross_page_continue)
> + tzcntl %eax, %eax
> + addq %RRAW_PTR_REG, %rax
> L(return_vzeroupper):
> ZERO_UPPER_VEC_REGISTERS_RETURN
>
> .p2align 4
> -L(aligned_more):
> -# ifndef USE_AS_RAWMEMCHR
> - /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> - instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> - overflow. */
> - negq %rcx
> - addq $VEC_SIZE, %rcx
> +L(first_vec_x1):
> + tzcntl %eax, %eax
> + incq %rdi
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
>
> - /* Check the end of data. */
> - subq %rcx, %rdx
> - jbe L(zero)
> -# endif
> + .p2align 4
> +L(first_vec_x2):
> + tzcntl %eax, %eax
> + addq $(VEC_SIZE + 1), %rdi
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
>
> - addq $VEC_SIZE, %rdi
> + .p2align 4
> +L(first_vec_x3):
> + tzcntl %eax, %eax
> + addq $(VEC_SIZE * 2 + 1), %rdi
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
>
> -# ifndef USE_AS_RAWMEMCHR
> - subq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec_or_less)
> -# endif
>
> -L(more_4x_vec):
> - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> - since data is only aligned to VEC_SIZE. */
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x0)
> + .p2align 4
> +L(first_vec_x4):
> + tzcntl %eax, %eax
> + addq $(VEC_SIZE * 3 + 1), %rdi
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
>
> - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> + .p2align 4
> +L(aligned_more):
> + /* Check the first 4 * VEC_SIZE. Only one
> + VEC_SIZE at a time since data is only aligned to
> + VEC_SIZE. */
Fit comments to 72 columns.
> +
> +# ifndef USE_AS_RAWMEMCHR
> +L(cross_page_continue):
> + /* Align data to VEC_SIZE - 1. */
> + xorl %ecx, %ecx
> + subl %edi, %ecx
> + orq $(VEC_SIZE - 1), %rdi
> + /* esi is for adjusting length to see if near the
> + end. */
Fit comments to 72 columns.
> + leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
> +# else
> + orq $(VEC_SIZE - 1), %rdi
> +L(cross_page_continue):
> +# endif
> + /* Load first VEC regardless. */
> + VPCMPEQ 1(%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> +# ifndef USE_AS_RAWMEMCHR
> + /* Adjust length. If near end handle specially.
> + */
Put the comments on one line.
> + subq %rsi, %rdx
> + jbe L(last_4x_vec_or_less)
> +# endif
> testl %eax, %eax
> jnz L(first_vec_x1)
>
> - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> jnz L(first_vec_x2)
>
> - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> jnz L(first_vec_x3)
>
> - addq $(VEC_SIZE * 4), %rdi
> + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> + vpmovmskb %ymm1, %eax
> + testl %eax, %eax
> + jnz L(first_vec_x4)
>
> # ifndef USE_AS_RAWMEMCHR
> + /* Check if at last VEC_SIZE * 4 length. */
> subq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec_or_less)
> -# endif
> -
> - /* Align data to 4 * VEC_SIZE. */
> - movq %rdi, %rcx
> - andl $(4 * VEC_SIZE - 1), %ecx
> - andq $-(4 * VEC_SIZE), %rdi
> -
> -# ifndef USE_AS_RAWMEMCHR
> - /* Adjust length. */
> + jbe L(last_4x_vec_or_less_cmpeq)
> + /* Align data to VEC_SIZE * 4 - 1 for the loop
> + and readjust length. */
> + incq %rdi
> + movl %edi, %ecx
> + orq $(VEC_SIZE * 4 - 1), %rdi
> + andl $(VEC_SIZE * 4 - 1), %ecx
> addq %rcx, %rdx
> +# else
> + /* Align data to VEC_SIZE * 4 - 1 for loop. */
> + incq %rdi
> + orq $(VEC_SIZE * 4 - 1), %rdi
> # endif
>
> + /* Compare 4 * VEC at a time forward. */
> .p2align 4
> L(loop_4x_vec):
> - /* Compare 4 * VEC at a time forward. */
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
> - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
> - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
> -
> + VPCMPEQ 1(%rdi), %ymm0, %ymm1
> + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
> + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
> + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
> vpor %ymm1, %ymm2, %ymm5
> vpor %ymm3, %ymm4, %ymm6
> vpor %ymm5, %ymm6, %ymm5
>
> - vpmovmskb %ymm5, %eax
> - testl %eax, %eax
> - jnz L(4x_vec_end)
> -
> - addq $(VEC_SIZE * 4), %rdi
> -
> + vpmovmskb %ymm5, %ecx
> # ifdef USE_AS_RAWMEMCHR
> - jmp L(loop_4x_vec)
> + subq $-(VEC_SIZE * 4), %rdi
> + testl %ecx, %ecx
> + jz L(loop_4x_vec)
> # else
> - subq $(VEC_SIZE * 4), %rdx
> - ja L(loop_4x_vec)
> + testl %ecx, %ecx
> + jnz L(loop_4x_vec_end)
>
> -L(last_4x_vec_or_less):
> - /* Less than 4 * VEC and aligned to VEC_SIZE. */
> - addl $(VEC_SIZE * 2), %edx
> - jle L(last_2x_vec)
> + subq $-(VEC_SIZE * 4), %rdi
>
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x0)
> + subq $(VEC_SIZE * 4), %rdx
> + ja L(loop_4x_vec)
>
> - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> + /* Fall through into less than 4 remaining
> + vectors of length case. */
Fit comments to 72 columns.
> + VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> + .p2align 4
> +L(last_4x_vec_or_less):
> + /* Check if first VEC contained match. */
> testl %eax, %eax
> - jnz L(first_vec_x1)
> + jnz L(first_vec_x1_check)
>
> - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> + /* If remaining length > VEC_SIZE * 2. */
> + addl $(VEC_SIZE * 2), %edx
> + jg L(last_4x_vec)
>
> - jnz L(first_vec_x2_check)
> - subl $VEC_SIZE, %edx
> - jle L(zero)
> +L(last_2x_vec):
> + /* If remaining length < VEC_SIZE. */
> + addl $VEC_SIZE, %edx
> + jle L(zero_end)
>
> - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> + /* Check VEC2 and compare any match with
> + remaining length. */
Fit comments to 72 columns.
> + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> -
> - jnz L(first_vec_x3_check)
> - xorl %eax, %eax
> + tzcntl %eax, %eax
> + cmpl %eax, %edx
> + jbe L(set_zero_end)
> + addq $(VEC_SIZE + 1), %rdi
> + addq %rdi, %rax
> +L(zero_end):
> VZEROUPPER_RETURN
>
> .p2align 4
> -L(last_2x_vec):
> - addl $(VEC_SIZE * 2), %edx
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> +L(loop_4x_vec_end):
> +# endif
> + /* rawmemchr will fall through into this if match
> + was found in loop. */
Fit comments to 72 columns.
> +
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> + jnz L(last_vec_x1_return)
>
> - jnz L(first_vec_x0_check)
> - subl $VEC_SIZE, %edx
> - jle L(zero)
> -
> - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> + vpmovmskb %ymm2, %eax
> testl %eax, %eax
> - jnz L(first_vec_x1_check)
> - xorl %eax, %eax
> - VZEROUPPER_RETURN
> + jnz L(last_vec_x2_return)
>
> - .p2align 4
> -L(first_vec_x0_check):
> - tzcntl %eax, %eax
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> + vpmovmskb %ymm3, %eax
> + /* Combine VEC3 matches (eax) with VEC4 matches
> + (ecx). */
Fit comments to 72 columns.
> + salq $32, %rcx
> + orq %rcx, %rax
> + tzcntq %rax, %rax
> +# ifdef USE_AS_RAWMEMCHR
> + subq $(VEC_SIZE * 2 - 1), %rdi
> +# else
> + subq $-(VEC_SIZE * 2 + 1), %rdi
> +# endif
> addq %rdi, %rax
> VZEROUPPER_RETURN
> +# ifndef USE_AS_RAWMEMCHR
>
> .p2align 4
> L(first_vec_x1_check):
> tzcntl %eax, %eax
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $VEC_SIZE, %rax
> + /* Adjust length. */
> + subl $-(VEC_SIZE * 4), %edx
> + /* Check if match within remaining length. */
> + cmpl %eax, %edx
> + jbe L(set_zero_end)
> + incq %rdi
> addq %rdi, %rax
> VZEROUPPER_RETURN
> + .p2align 4
> +L(set_zero_end):
> + xorl %eax, %eax
> + VZEROUPPER_RETURN
> +# endif
>
> .p2align 4
> -L(first_vec_x2_check):
> +L(last_vec_x1_return):
> tzcntl %eax, %eax
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $(VEC_SIZE * 2), %rax
> +# ifdef USE_AS_RAWMEMCHR
> + subq $(VEC_SIZE * 4 - 1), %rdi
> +# else
> + incq %rdi
> +# endif
> addq %rdi, %rax
> VZEROUPPER_RETURN
>
> .p2align 4
> -L(first_vec_x3_check):
> +L(last_vec_x2_return):
> tzcntl %eax, %eax
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $(VEC_SIZE * 3), %rax
> +# ifdef USE_AS_RAWMEMCHR
> + subq $(VEC_SIZE * 3 - 1), %rdi
> +# else
> + subq $-(VEC_SIZE + 1), %rdi
> +# endif
> addq %rdi, %rax
> VZEROUPPER_RETURN
>
> +# ifndef USE_AS_RAWMEMCHR
> .p2align 4
> -L(zero):
> - xorl %eax, %eax
> - jmp L(return_vzeroupper)
> +L(last_4x_vec_or_less_cmpeq):
> + VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
> + vpmovmskb %ymm1, %eax
> + subq $-(VEC_SIZE * 4), %rdi
> + /* Check first VEC regardless. */
> + testl %eax, %eax
> + jnz L(first_vec_x1_check)
>
> + /* If remaining length <= CHAR_PER_VEC * 2. */
> + addl $(VEC_SIZE * 2), %edx
> + jle L(last_2x_vec)
> .p2align 4
> -L(null):
> - xorl %eax, %eax
> - ret
> -# endif
> +L(last_4x_vec):
> + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> + vpmovmskb %ymm1, %eax
> + testl %eax, %eax
> + jnz L(last_vec_x2_return)
>
> - .p2align 4
> -L(first_vec_x0):
> - tzcntl %eax, %eax
> - addq %rdi, %rax
> - VZEROUPPER_RETURN
> + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> + vpmovmskb %ymm1, %eax
>
> - .p2align 4
> -L(first_vec_x1):
> - tzcntl %eax, %eax
> - addq $VEC_SIZE, %rax
> - addq %rdi, %rax
> - VZEROUPPER_RETURN
> + /* Create mask for possible matches within
> + remaining length. */
Fit comments to 72 columns.
> + movq $-1, %rcx
> + bzhiq %rdx, %rcx, %rcx
>
> - .p2align 4
> -L(first_vec_x2):
> + /* Test matches in data against length match. */
> + andl %ecx, %eax
> + jnz L(last_vec_x3)
> +
> + /* if remaining length <= VEC_SIZE * 3 (Note this
> + is after remaining length was found to be > VEC_SIZE * 2.
Fit comments to 72 columns.
> + */
> + subl $VEC_SIZE, %edx
> + jbe L(zero_end2)
> +
> + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> + vpmovmskb %ymm1, %eax
> + /* Shift remaining length mask for last VEC. */
> + shrq $32, %rcx
> + andl %ecx, %eax
> + jz L(zero_end2)
> tzcntl %eax, %eax
> - addq $(VEC_SIZE * 2), %rax
> + addq $(VEC_SIZE * 3 + 1), %rdi
> addq %rdi, %rax
> +L(zero_end2):
> VZEROUPPER_RETURN
>
> .p2align 4
> -L(4x_vec_end):
> - vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x0)
> - vpmovmskb %ymm2, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x1)
> - vpmovmskb %ymm3, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x2)
> - vpmovmskb %ymm4, %eax
> - testl %eax, %eax
> -L(first_vec_x3):
> +L(last_vec_x3):
> tzcntl %eax, %eax
> - addq $(VEC_SIZE * 3), %rax
> + subq $-(VEC_SIZE * 2 + 1), %rdi
> addq %rdi, %rax
> VZEROUPPER_RETURN
> +# endif
>
> -END (MEMCHR)
> +END(MEMCHR)
No need for this change.
> #endif
> --
> 2.29.2
>
Thanks.
H.J.
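
As context for the "less strict" page-cross logic discussed in this patch: the
old prologue fell back to the slow path whenever the pointer sat in the upper
half of a 2 * VEC_SIZE window, while the new code only does so when an
unaligned VEC_SIZE load could actually run past the current page. A hedged C
sketch of the new check (names invented, not the glibc code, assuming 4 KiB
pages and 32-byte vectors):

#include <stdbool.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE 32

/* Illustration only: a first-vector load starting at ptr can fault only if
   fewer than VEC_SIZE bytes remain in the current page, which is what the
   new andl/cmpl/ja entry sequence tests.  */
static bool
may_cross_page (const void *ptr)
{
  uintptr_t page_offset = (uintptr_t) ptr & (PAGE_SIZE - 1);
  return page_offset > PAGE_SIZE - VEC_SIZE;
}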
On Mon, May 3, 2021 at 2:50 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, May 03, 2021 at 04:44:36AM -0400, Noah Goldstein wrote:
> > No bug. This commit optimizes memchr-avx2.S. The optimizations include
> > replacing some branches with cmovcc, avoiding some branches entirely
> > in the less_4x_vec case, making the page cross logic less strict,
> > and saving a few instructions in the loop return. test-memchr,
> > test-rawmemchr, and test-wmemchr are all passing.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> > sysdeps/x86_64/multiarch/memchr-avx2.S | 446 +++++++++++++++----------
> > 1 file changed, 262 insertions(+), 184 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
> > index 1fcb1c350f..8368fcd1e1 100644
> > --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
> > @@ -26,8 +26,22 @@
> >
> > # ifdef USE_AS_WMEMCHR
> > # define VPCMPEQ vpcmpeqd
> > +# define VPBROADCAST vpbroadcastd
> > +# define CHAR_SIZE 4
> > # else
> > # define VPCMPEQ vpcmpeqb
> > +# define VPBROADCAST vpbroadcastb
> > +# define CHAR_SIZE 1
> > +# endif
> > +
> > +# ifdef USE_AS_RAWMEMCHR
> > +# define ERAW_PTR_REG ecx
> > +# define RRAW_PTR_REG rcx
> > +# define ALGN_PTR_REG rdi
> > +# else
> > +# define ERAW_PTR_REG edi
> > +# define RRAW_PTR_REG rdi
> > +# define ALGN_PTR_REG rcx
> > # endif
> >
> > # ifndef VZEROUPPER
> > @@ -39,303 +53,367 @@
> > # endif
> >
> > # define VEC_SIZE 32
> > +# define PAGE_SIZE 4096
> > +
>
> Remove the extra line here.
Done.
>
> >
> > .section SECTION(.text),"ax",@progbits
> > -ENTRY (MEMCHR)
> > +ENTRY(MEMCHR)
>
> No need for this change.
Fixed.
>
> > # ifndef USE_AS_RAWMEMCHR
> > /* Check for zero length. */
> > test %RDX_LP, %RDX_LP
> > jz L(null)
> > # endif
> > - movl %edi, %ecx
> > - /* Broadcast CHAR to YMM0. */
> > - vmovd %esi, %xmm0
> > # ifdef USE_AS_WMEMCHR
> > shl $2, %RDX_LP
> > - vpbroadcastd %xmm0, %ymm0
> > # else
> > # ifdef __ILP32__
> > /* Clear the upper 32 bits. */
> > movl %edx, %edx
> > # endif
> > - vpbroadcastb %xmm0, %ymm0
> > # endif
> > - /* Check if we may cross page boundary with one vector load. */
> > - andl $(2 * VEC_SIZE - 1), %ecx
> > - cmpl $VEC_SIZE, %ecx
> > - ja L(cros_page_boundary)
> > + /* Broadcast CHAR to YMMMATCH. */
> > + vmovd %esi, %xmm0
> > + VPBROADCAST %xmm0, %ymm0
> > + /* Check if we may cross page boundary with one
> > + vector load. */
> > + movl %edi, %eax
> > + andl $(PAGE_SIZE - 1), %eax
> > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > + ja L(cross_page_boundary)
> >
> > /* Check the first VEC_SIZE bytes. */
> > - VPCMPEQ (%rdi), %ymm0, %ymm1
> > + VPCMPEQ (%rdi), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > - testl %eax, %eax
> > -
> > # ifndef USE_AS_RAWMEMCHR
> > - jnz L(first_vec_x0_check)
> > - /* Adjust length and check the end of data. */
> > - subq $VEC_SIZE, %rdx
> > - jbe L(zero)
> > -# else
> > - jnz L(first_vec_x0)
> > + /* If length < CHAR_PER_VEC handle special. */
> > + cmpq $VEC_SIZE, %rdx
> > + jbe L(first_vec_x0)
> > # endif
> > -
> > - /* Align data for aligned loads in the loop. */
> > - addq $VEC_SIZE, %rdi
> > - andl $(VEC_SIZE - 1), %ecx
> > - andq $-VEC_SIZE, %rdi
> > + testl %eax, %eax
> > + jz L(aligned_more)
> > + tzcntl %eax, %eax
> > + addq %rdi, %rax
> > + VZEROUPPER_RETURN
> >
> > # ifndef USE_AS_RAWMEMCHR
> > - /* Adjust length. */
> > - addq %rcx, %rdx
> > -
> > - subq $(VEC_SIZE * 4), %rdx
> > - jbe L(last_4x_vec_or_less)
> > + .p2align 5
> > +L(first_vec_x0):
> > + /* Check if first match was before length. */
> > + tzcntl %eax, %eax
> > + xorl %ecx, %ecx
> > + cmpl %eax, %edx
> > + leaq (%rdi, %rax), %rax
> > + cmovle %rcx, %rax
> > + VZEROUPPER_RETURN
> > +L(null):
> > + xorl %eax, %eax
> > + ret
> > # endif
> > - jmp L(more_4x_vec)
> > -
> > .p2align 4
> > -L(cros_page_boundary):
> > - andl $(VEC_SIZE - 1), %ecx
> > - andq $-VEC_SIZE, %rdi
> > - VPCMPEQ (%rdi), %ymm0, %ymm1
> > +L(cross_page_boundary):
> > + /* Save pointer before aligning as its original
> > + value is necessary for computer return address if byte is
> > + found or adjusting length if it is not and this is
>
> Fit comments to 72 columns.
Fixed. Still working out the kinks in my formatter.
For the 72-column fill, does a tab count as 1, 4, or 8 units?
>
> > + memchr. */
> > + movq %rdi, %rcx
> > + /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is
> > + rcx for memchr and rdi for rawmemchr. */
> > + orq $(VEC_SIZE - 1), %ALGN_PTR_REG
> > + VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > +# ifndef USE_AS_RAWMEMCHR
> > + /* Calculate length until end of page (length
> > + checked for a match). */
> > + leaq 1(%ALGN_PTR_REG), %rsi
> > + subq %RRAW_PTR_REG, %rsi
> > +# endif
> > /* Remove the leading bytes. */
> > - sarl %cl, %eax
> > - testl %eax, %eax
> > - jz L(aligned_more)
> > - tzcntl %eax, %eax
> > + sarxl %ERAW_PTR_REG, %eax, %eax
> > # ifndef USE_AS_RAWMEMCHR
> > /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > + cmpq %rsi, %rdx
> > + jbe L(first_vec_x0)
> > # endif
> > - addq %rdi, %rax
> > - addq %rcx, %rax
> > + testl %eax, %eax
> > + jz L(cross_page_continue)
> > + tzcntl %eax, %eax
> > + addq %RRAW_PTR_REG, %rax
> > L(return_vzeroupper):
> > ZERO_UPPER_VEC_REGISTERS_RETURN
> >
> > .p2align 4
> > -L(aligned_more):
> > -# ifndef USE_AS_RAWMEMCHR
> > - /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> > - instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> > - overflow. */
> > - negq %rcx
> > - addq $VEC_SIZE, %rcx
> > +L(first_vec_x1):
> > + tzcntl %eax, %eax
> > + incq %rdi
> > + addq %rdi, %rax
> > + VZEROUPPER_RETURN
> >
> > - /* Check the end of data. */
> > - subq %rcx, %rdx
> > - jbe L(zero)
> > -# endif
> > + .p2align 4
> > +L(first_vec_x2):
> > + tzcntl %eax, %eax
> > + addq $(VEC_SIZE + 1), %rdi
> > + addq %rdi, %rax
> > + VZEROUPPER_RETURN
> >
> > - addq $VEC_SIZE, %rdi
> > + .p2align 4
> > +L(first_vec_x3):
> > + tzcntl %eax, %eax
> > + addq $(VEC_SIZE * 2 + 1), %rdi
> > + addq %rdi, %rax
> > + VZEROUPPER_RETURN
> >
> > -# ifndef USE_AS_RAWMEMCHR
> > - subq $(VEC_SIZE * 4), %rdx
> > - jbe L(last_4x_vec_or_less)
> > -# endif
> >
> > -L(more_4x_vec):
> > - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> > - since data is only aligned to VEC_SIZE. */
> > - VPCMPEQ (%rdi), %ymm0, %ymm1
> > - vpmovmskb %ymm1, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x0)
> > + .p2align 4
> > +L(first_vec_x4):
> > + tzcntl %eax, %eax
> > + addq $(VEC_SIZE * 3 + 1), %rdi
> > + addq %rdi, %rax
> > + VZEROUPPER_RETURN
> >
> > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > + .p2align 4
> > +L(aligned_more):
> > + /* Check the first 4 * VEC_SIZE. Only one
> > + VEC_SIZE at a time since data is only aligned to
> > + VEC_SIZE. */
>
> Fit comments to 72 columns.
Adjusted closer. Hopefully fixed.
>
> > +
> > +# ifndef USE_AS_RAWMEMCHR
> > +L(cross_page_continue):
> > + /* Align data to VEC_SIZE - 1. */
> > + xorl %ecx, %ecx
> > + subl %edi, %ecx
> > + orq $(VEC_SIZE - 1), %rdi
> > + /* esi is for adjusting length to see if near the
> > + end. */
>
> Fit comments to 72 columns.
Adjusted closer. Hopefully fixed.
>
> > + leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
> > +# else
> > + orq $(VEC_SIZE - 1), %rdi
> > +L(cross_page_continue):
> > +# endif
> > + /* Load first VEC regardless. */
> > + VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > +# ifndef USE_AS_RAWMEMCHR
> > + /* Adjust length. If near end handle specially.
> > + */
>
> Put the comments on one line.
Fixed.
>
> > + subq %rsi, %rdx
> > + jbe L(last_4x_vec_or_less)
> > +# endif
> > testl %eax, %eax
> > jnz L(first_vec_x1)
> >
> > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x2)
> >
> > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x3)
> >
> > - addq $(VEC_SIZE * 4), %rdi
> > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > + vpmovmskb %ymm1, %eax
> > + testl %eax, %eax
> > + jnz L(first_vec_x4)
> >
> > # ifndef USE_AS_RAWMEMCHR
> > + /* Check if at last VEC_SIZE * 4 length. */
> > subq $(VEC_SIZE * 4), %rdx
> > - jbe L(last_4x_vec_or_less)
> > -# endif
> > -
> > - /* Align data to 4 * VEC_SIZE. */
> > - movq %rdi, %rcx
> > - andl $(4 * VEC_SIZE - 1), %ecx
> > - andq $-(4 * VEC_SIZE), %rdi
> > -
> > -# ifndef USE_AS_RAWMEMCHR
> > - /* Adjust length. */
> > + jbe L(last_4x_vec_or_less_cmpeq)
> > + /* Align data to VEC_SIZE * 4 - 1 for the loop
> > + and readjust length. */
> > + incq %rdi
> > + movl %edi, %ecx
> > + orq $(VEC_SIZE * 4 - 1), %rdi
> > + andl $(VEC_SIZE * 4 - 1), %ecx
> > addq %rcx, %rdx
> > +# else
> > + /* Align data to VEC_SIZE * 4 - 1 for loop. */
> > + incq %rdi
> > + orq $(VEC_SIZE * 4 - 1), %rdi
> > # endif
> >
> > + /* Compare 4 * VEC at a time forward. */
> > .p2align 4
> > L(loop_4x_vec):
> > - /* Compare 4 * VEC at a time forward. */
> > - VPCMPEQ (%rdi), %ymm0, %ymm1
> > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
> > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
> > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
> > -
> > + VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
> > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
> > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
> > vpor %ymm1, %ymm2, %ymm5
> > vpor %ymm3, %ymm4, %ymm6
> > vpor %ymm5, %ymm6, %ymm5
> >
> > - vpmovmskb %ymm5, %eax
> > - testl %eax, %eax
> > - jnz L(4x_vec_end)
> > -
> > - addq $(VEC_SIZE * 4), %rdi
> > -
> > + vpmovmskb %ymm5, %ecx
> > # ifdef USE_AS_RAWMEMCHR
> > - jmp L(loop_4x_vec)
> > + subq $-(VEC_SIZE * 4), %rdi
> > + testl %ecx, %ecx
> > + jz L(loop_4x_vec)
> > # else
> > - subq $(VEC_SIZE * 4), %rdx
> > - ja L(loop_4x_vec)
> > + testl %ecx, %ecx
> > + jnz L(loop_4x_vec_end)
> >
> > -L(last_4x_vec_or_less):
> > - /* Less than 4 * VEC and aligned to VEC_SIZE. */
> > - addl $(VEC_SIZE * 2), %edx
> > - jle L(last_2x_vec)
> > + subq $-(VEC_SIZE * 4), %rdi
> >
> > - VPCMPEQ (%rdi), %ymm0, %ymm1
> > - vpmovmskb %ymm1, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x0)
> > + subq $(VEC_SIZE * 4), %rdx
> > + ja L(loop_4x_vec)
> >
> > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > + /* Fall through into less than 4 remaining
> > + vectors of length case. */
>
> Fit comments to 72 columns.
Adjusted closer. Hopefully fixed.
>
> > + VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > + .p2align 4
> > +L(last_4x_vec_or_less):
> > + /* Check if first VEC contained match. */
> > testl %eax, %eax
> > - jnz L(first_vec_x1)
> > + jnz L(first_vec_x1_check)
> >
> > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > - vpmovmskb %ymm1, %eax
> > - testl %eax, %eax
> > + /* If remaining length > VEC_SIZE * 2. */
> > + addl $(VEC_SIZE * 2), %edx
> > + jg L(last_4x_vec)
> >
> > - jnz L(first_vec_x2_check)
> > - subl $VEC_SIZE, %edx
> > - jle L(zero)
> > +L(last_2x_vec):
> > + /* If remaining length < VEC_SIZE. */
> > + addl $VEC_SIZE, %edx
> > + jle L(zero_end)
> >
> > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > + /* Check VEC2 and compare any match with
> > + remaining length. */
>
> Fit comments to 72 columns.
Adjusted closer. Hopefully fixed.
>
> > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > - testl %eax, %eax
> > -
> > - jnz L(first_vec_x3_check)
> > - xorl %eax, %eax
> > + tzcntl %eax, %eax
> > + cmpl %eax, %edx
> > + jbe L(set_zero_end)
> > + addq $(VEC_SIZE + 1), %rdi
> > + addq %rdi, %rax
> > +L(zero_end):
> > VZEROUPPER_RETURN
> >
> > .p2align 4
> > -L(last_2x_vec):
> > - addl $(VEC_SIZE * 2), %edx
> > - VPCMPEQ (%rdi), %ymm0, %ymm1
> > +L(loop_4x_vec_end):
> > +# endif
> > + /* rawmemchr will fall through into this if match
> > + was found in loop. */
>
> Fit comments to 72 columns.
Adjusted closer. Hopefully fixed.
>
> > +
> > vpmovmskb %ymm1, %eax
> > testl %eax, %eax
> > + jnz L(last_vec_x1_return)
> >
> > - jnz L(first_vec_x0_check)
> > - subl $VEC_SIZE, %edx
> > - jle L(zero)
> > -
> > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > - vpmovmskb %ymm1, %eax
> > + vpmovmskb %ymm2, %eax
> > testl %eax, %eax
> > - jnz L(first_vec_x1_check)
> > - xorl %eax, %eax
> > - VZEROUPPER_RETURN
> > + jnz L(last_vec_x2_return)
> >
> > - .p2align 4
> > -L(first_vec_x0_check):
> > - tzcntl %eax, %eax
> > - /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > + vpmovmskb %ymm3, %eax
> > + /* Combine VEC3 matches (eax) with VEC4 matches
> > + (ecx). */
>
> Fit comments to 72 columns.
Adjusted closer. Hopefully fixed.
>
> > + salq $32, %rcx
> > + orq %rcx, %rax
> > + tzcntq %rax, %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > + subq $(VEC_SIZE * 2 - 1), %rdi
> > +# else
> > + subq $-(VEC_SIZE * 2 + 1), %rdi
> > +# endif
> > addq %rdi, %rax
> > VZEROUPPER_RETURN
> > +# ifndef USE_AS_RAWMEMCHR
> >
> > .p2align 4
> > L(first_vec_x1_check):
> > tzcntl %eax, %eax
> > - /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > - addq $VEC_SIZE, %rax
> > + /* Adjust length. */
> > + subl $-(VEC_SIZE * 4), %edx
> > + /* Check if match within remaining length. */
> > + cmpl %eax, %edx
> > + jbe L(set_zero_end)
> > + incq %rdi
> > addq %rdi, %rax
> > VZEROUPPER_RETURN
> > + .p2align 4
> > +L(set_zero_end):
> > + xorl %eax, %eax
> > + VZEROUPPER_RETURN
> > +# endif
> >
> > .p2align 4
> > -L(first_vec_x2_check):
> > +L(last_vec_x1_return):
> > tzcntl %eax, %eax
> > - /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > - addq $(VEC_SIZE * 2), %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > + subq $(VEC_SIZE * 4 - 1), %rdi
> > +# else
> > + incq %rdi
> > +# endif
> > addq %rdi, %rax
> > VZEROUPPER_RETURN
> >
> > .p2align 4
> > -L(first_vec_x3_check):
> > +L(last_vec_x2_return):
> > tzcntl %eax, %eax
> > - /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > - addq $(VEC_SIZE * 3), %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > + subq $(VEC_SIZE * 3 - 1), %rdi
> > +# else
> > + subq $-(VEC_SIZE + 1), %rdi
> > +# endif
> > addq %rdi, %rax
> > VZEROUPPER_RETURN
> >
> > +# ifndef USE_AS_RAWMEMCHR
> > .p2align 4
> > -L(zero):
> > - xorl %eax, %eax
> > - jmp L(return_vzeroupper)
> > +L(last_4x_vec_or_less_cmpeq):
> > + VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
> > + vpmovmskb %ymm1, %eax
> > + subq $-(VEC_SIZE * 4), %rdi
> > + /* Check first VEC regardless. */
> > + testl %eax, %eax
> > + jnz L(first_vec_x1_check)
> >
> > + /* If remaining length <= CHAR_PER_VEC * 2. */
> > + addl $(VEC_SIZE * 2), %edx
> > + jle L(last_2x_vec)
> > .p2align 4
> > -L(null):
> > - xorl %eax, %eax
> > - ret
> > -# endif
> > +L(last_4x_vec):
> > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > + vpmovmskb %ymm1, %eax
> > + testl %eax, %eax
> > + jnz L(last_vec_x2_return)
> >
> > - .p2align 4
> > -L(first_vec_x0):
> > - tzcntl %eax, %eax
> > - addq %rdi, %rax
> > - VZEROUPPER_RETURN
> > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > + vpmovmskb %ymm1, %eax
> >
> > - .p2align 4
> > -L(first_vec_x1):
> > - tzcntl %eax, %eax
> > - addq $VEC_SIZE, %rax
> > - addq %rdi, %rax
> > - VZEROUPPER_RETURN
> > + /* Create mask for possible matches within
> > + remaining length. */
>
> Fit comments to 72 columns.
Adjusted closer. Hopefully fixed.
>
> > + movq $-1, %rcx
> > + bzhiq %rdx, %rcx, %rcx
> >
> > - .p2align 4
> > -L(first_vec_x2):
> > + /* Test matches in data against length match. */
> > + andl %ecx, %eax
> > + jnz L(last_vec_x3)
> > +
> > + /* if remaining length <= VEC_SIZE * 3 (Note this
> > + is after remaining length was found to be > VEC_SIZE * 2.
>
> Fit comments to 72 columns.
Adjusted closer. Hopefully fixed.
>
> > + */
> > + subl $VEC_SIZE, %edx
> > + jbe L(zero_end2)
> > +
> > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > + vpmovmskb %ymm1, %eax
> > + /* Shift remaining length mask for last VEC. */
> > + shrq $32, %rcx
> > + andl %ecx, %eax
> > + jz L(zero_end2)
> > tzcntl %eax, %eax
> > - addq $(VEC_SIZE * 2), %rax
> > + addq $(VEC_SIZE * 3 + 1), %rdi
> > addq %rdi, %rax
> > +L(zero_end2):
> > VZEROUPPER_RETURN
> >
> > .p2align 4
> > -L(4x_vec_end):
> > - vpmovmskb %ymm1, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x0)
> > - vpmovmskb %ymm2, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x1)
> > - vpmovmskb %ymm3, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x2)
> > - vpmovmskb %ymm4, %eax
> > - testl %eax, %eax
> > -L(first_vec_x3):
> > +L(last_vec_x3):
> > tzcntl %eax, %eax
> > - addq $(VEC_SIZE * 3), %rax
> > + subq $-(VEC_SIZE * 2 + 1), %rdi
> > addq %rdi, %rax
> > VZEROUPPER_RETURN
> > +# endif
> >
> > -END (MEMCHR)
> > +END(MEMCHR)
>
> No need for this change.
Fixed.
>
> > #endif
> > --
> > 2.29.2
> >
>
> Thanks.
>
> H.J.
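
One more piece of background on the new tail handling: in L(last_4x_vec) the
movq $-1 / bzhiq pair builds a mask of the byte positions that still fall
within the remaining length, so stray match bits past the end can be dropped
with a single and. A hedged C equivalent (names invented, not the glibc code):

#include <stdint.h>

/* Illustration of the bzhi trick: keep only match bits whose position is
   below 'remaining'.  bzhi with an all-ones source yields a mask of the low
   'remaining' bits (all bits when remaining >= 64).  */
static uint64_t
mask_matches_in_range (uint64_t match_bits, unsigned int remaining)
{
  uint64_t length_mask = remaining < 64
                         ? ((uint64_t) 1 << remaining) - 1
                         : UINT64_MAX;
  return match_bits & length_mask;
}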