[v3,1/2] x86: Refactor and improve performance of strchr-avx2.S
Commit Message
From: noah <goldstein.w.n@gmail.com>
No bug. Just seemed the performance could be improved a bit. Observed
and expected behavior are unchanged. Optimized body of main
loop. Updated page cross logic and optimized accordingly. Made a few
minor instruction selection modifications. No regressions in test
suite. Both test-strchrnul and test-strchr passed.
Signed-off-by: noah <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
sysdeps/x86_64/multiarch/strchr.c      |   1 +
2 files changed, 118 insertions(+), 118 deletions(-)
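
The heart of the main-loop rewrite described above: instead of comparing each vector against both CHAR and null (two VPCMPEQ plus a VPOR per vector), the new loop XORs the vector with the broadcast CHAR and takes the unsigned byte minimum with the original data, so a zero byte in the result marks either a CHAR or a null match and a single compare against zero suffices after the min reduction. A minimal C sketch of the byte variant, assuming AVX2 intrinsics (the function name is illustrative, not from the patch):

#include <immintrin.h>
#include <stdint.h>

/* Old per-vector test: hit = (v == c) | (v == 0).
   New per-vector test: min_u8 (v ^ c, v) has a zero byte iff some
   byte of v equals CHAR (v ^ c == 0 there) or is null (v == 0).  */
static inline uint32_t
chunk_match_mask (__m256i v, __m256i c)
{
  __m256i t = _mm256_xor_si256 (v, c);            /* 0 where byte == CHAR.  */
  __m256i m = _mm256_min_epu8 (t, v);             /* 0 where CHAR or null.  */
  __m256i eq = _mm256_cmpeq_epi8 (m, _mm256_setzero_si256 ());
  return (uint32_t) _mm256_movemask_epi8 (eq);    /* One bit per hit byte.  */
}

In the 4x unrolled loop this drops one compare per vector, and the four min results can be reduced pairwise (as the VPMINU tree in the diff does) before the lone VPCMPEQ against zero; the wide-character build substitutes vpminud via the VPMINU macro.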
Comments
On Tue, Feb 2, 2021 at 1:40 AM <goldstein.w.n@gmail.com> wrote:
>
> From: noah <goldstein.w.n@gmail.com>
>
> No bug. Just seemed the performance could be improved a bit. Observed
> and expected behavior are unchanged. Optimized body of main
> loop. Updated page cross logic and optimized accordingly. Made a few
> minor instruction selection modifications. No regressions in test
> suite. Both test-strchrnul and test-strchr passed.
>
> Signed-off-by: noah <goldstein.w.n@gmail.com>
> ---
> sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
> sysdeps/x86_64/multiarch/strchr.c      |   1 +
> 2 files changed, 118 insertions(+), 118 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> index d416558d04..806ca66a9b 100644
> --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> @@ -27,10 +27,12 @@
> # ifdef USE_AS_WCSCHR
> # define VPBROADCAST vpbroadcastd
> # define VPCMPEQ vpcmpeqd
> +# define VPMINU vpminud
> # define CHAR_REG esi
> # else
> # define VPBROADCAST vpbroadcastb
> # define VPCMPEQ vpcmpeqb
> +# define VPMINU vpminub
> # define CHAR_REG sil
> # endif
>
> @@ -39,20 +41,26 @@
> # endif
>
> # define VEC_SIZE 32
> +# define PAGE_SIZE 4096
>
> .section .text.avx,"ax",@progbits
> ENTRY (STRCHR)
> movl %edi, %ecx
> - /* Broadcast CHAR to YMM0. */
> +# ifndef USE_AS_STRCHRNUL
> + xorl %edx, %edx
> +# endif
> +
> + /* Broadcast CHAR to YMM0. */
> vmovd %esi, %xmm0
> vpxor %xmm9, %xmm9, %xmm9
> VPBROADCAST %xmm0, %ymm0
> - /* Check if we may cross page boundary with one vector load. */
> - andl $(2 * VEC_SIZE - 1), %ecx
> - cmpl $VEC_SIZE, %ecx
> - ja L(cros_page_boundary)
> -
> - /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> +
> + /* Check if we cross page boundary with one vector load. */
> + andl $(PAGE_SIZE - 1), %ecx
> + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
> + ja L(cross_page_boundary)
> +
> + /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> null byte. */
> vmovdqu (%rdi), %ymm8
> VPCMPEQ %ymm8, %ymm0, %ymm1
> @@ -60,50 +68,27 @@ ENTRY (STRCHR)
> vpor %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> - jnz L(first_vec_x0)
> -
> - /* Align data for aligned loads in the loop. */
> - addq $VEC_SIZE, %rdi
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> -
> - jmp L(more_4x_vec)
> -
> - .p2align 4
> -L(cros_page_boundary):
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> - vmovdqu (%rdi), %ymm8
> - VPCMPEQ %ymm8, %ymm0, %ymm1
> - VPCMPEQ %ymm8, %ymm9, %ymm2
> - vpor %ymm1, %ymm2, %ymm1
> - vpmovmskb %ymm1, %eax
> - /* Remove the leading bytes. */
> - sarl %cl, %eax
> - testl %eax, %eax
> - jz L(aligned_more)
> - /* Found CHAR or the null byte. */
> + jz L(more_vecs)
> tzcntl %eax, %eax
> - addq %rcx, %rax
> -# ifdef USE_AS_STRCHRNUL
> + /* Found CHAR or the null byte. */
> addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> - leaq (%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
Remove the extra tab.
> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> ret
>
> .p2align 4
> +L(more_vecs):
> + /* Align data for aligned loads in the loop. */
> + andq $-VEC_SIZE, %rdi
> L(aligned_more):
> - addq $VEC_SIZE, %rdi
>
> -L(more_4x_vec):
> - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> - since data is only aligned to VEC_SIZE. */
> - vmovdqa (%rdi), %ymm8
> + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
> + since data is only aligned to VEC_SIZE. */
> + vmovdqa VEC_SIZE(%rdi), %ymm8
> + addq $VEC_SIZE, %rdi
> VPCMPEQ %ymm8, %ymm0, %ymm1
> VPCMPEQ %ymm8, %ymm9, %ymm2
> vpor %ymm1, %ymm2, %ymm1
> @@ -125,7 +110,7 @@ L(more_4x_vec):
> vpor %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> - jnz L(first_vec_x2)
> + jnz L(first_vec_x2)
>
> vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> VPCMPEQ %ymm8, %ymm0, %ymm1
> @@ -133,122 +118,136 @@ L(more_4x_vec):
> vpor %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> - jnz L(first_vec_x3)
> -
> - addq $(VEC_SIZE * 4), %rdi
> -
> - /* Align data to 4 * VEC_SIZE. */
> - movq %rdi, %rcx
> - andl $(4 * VEC_SIZE - 1), %ecx
> - andq $-(4 * VEC_SIZE), %rdi
> -
> - .p2align 4
> -L(loop_4x_vec):
> - /* Compare 4 * VEC at a time forward. */
> - vmovdqa (%rdi), %ymm5
> - vmovdqa VEC_SIZE(%rdi), %ymm6
> - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> -
> - VPCMPEQ %ymm5, %ymm0, %ymm1
> - VPCMPEQ %ymm6, %ymm0, %ymm2
> - VPCMPEQ %ymm7, %ymm0, %ymm3
> - VPCMPEQ %ymm8, %ymm0, %ymm4
> -
> - VPCMPEQ %ymm5, %ymm9, %ymm5
> - VPCMPEQ %ymm6, %ymm9, %ymm6
> - VPCMPEQ %ymm7, %ymm9, %ymm7
> - VPCMPEQ %ymm8, %ymm9, %ymm8
> -
> - vpor %ymm1, %ymm5, %ymm1
> - vpor %ymm2, %ymm6, %ymm2
> - vpor %ymm3, %ymm7, %ymm3
> - vpor %ymm4, %ymm8, %ymm4
> -
> - vpor %ymm1, %ymm2, %ymm5
> - vpor %ymm3, %ymm4, %ymm6
> -
> - vpor %ymm5, %ymm6, %ymm5
> -
> - vpmovmskb %ymm5, %eax
> - testl %eax, %eax
> - jnz L(4x_vec_end)
> -
> - addq $(VEC_SIZE * 4), %rdi
> + jz L(prep_loop_4x)
>
> - jmp L(loop_4x_vec)
> + tzcntl %eax, %eax
> + leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
Remove the extra tab.
> + cmovne %rdx, %rax
> +# endif
> + VZEROUPPER
> + ret
>
> .p2align 4
> L(first_vec_x0):
> - /* Found CHAR or the null byte. */
> tzcntl %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> + /* Found CHAR or the null byte. */
> addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> - leaq (%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
Remove the extra tab.
> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> ret
> -
> +
> .p2align 4
> L(first_vec_x1):
> tzcntl %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> - addq $VEC_SIZE, %rax
> - addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> leaq VEC_SIZE(%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
Remove the extra tab.
> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> - ret
> -
> + ret
> +
> .p2align 4
> L(first_vec_x2):
> tzcntl %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> - addq $(VEC_SIZE * 2), %rax
> - addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> + /* Found CHAR or the null byte. */
> leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
Remove the extra tab.
> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> ret
> +
> +L(prep_loop_4x):
> + /* Align data to 4 * VEC_SIZE. */
> + andq $-(VEC_SIZE * 4), %rdi
>
> .p2align 4
> -L(4x_vec_end):
> +L(loop_4x_vec):
> + /* Compare 4 * VEC at a time forward. */
> + vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> + vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> + vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> + vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> +
> + /* Leaves only CHARS matching esi as 0. */
> + vpxor %ymm5, %ymm0, %ymm1
> + vpxor %ymm6, %ymm0, %ymm2
> + vpxor %ymm7, %ymm0, %ymm3
> + vpxor %ymm8, %ymm0, %ymm4
> +
> + VPMINU %ymm1, %ymm5, %ymm1
> + VPMINU %ymm2, %ymm6, %ymm2
> + VPMINU %ymm3, %ymm7, %ymm3
> + VPMINU %ymm4, %ymm8, %ymm4
> +
> + VPMINU %ymm1, %ymm2, %ymm5
> + VPMINU %ymm3, %ymm4, %ymm6
> +
> + VPMINU %ymm5, %ymm6, %ymm5
> +
> + VPCMPEQ %ymm5, %ymm9, %ymm5
> + vpmovmskb %ymm5, %eax
> +
> + addq $(VEC_SIZE * 4), %rdi
> + testl %eax, %eax
> + jz L(loop_4x_vec)
> +
> + VPCMPEQ %ymm1, %ymm9, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> jnz L(first_vec_x0)
> +
> + VPCMPEQ %ymm2, %ymm9, %ymm2
> vpmovmskb %ymm2, %eax
> testl %eax, %eax
> jnz L(first_vec_x1)
> - vpmovmskb %ymm3, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x2)
> +
> + VPCMPEQ %ymm3, %ymm9, %ymm3
> + VPCMPEQ %ymm4, %ymm9, %ymm4
> + vpmovmskb %ymm3, %ecx
> vpmovmskb %ymm4, %eax
> + salq $32, %rax
> + orq %rcx, %rax
Remove the extra tab.
> + tzcntq %rax, %rax
> + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
Remove the extra tab.
> + cmovne %rdx, %rax
> +# endif
> + VZEROUPPER
> + ret
> +
> + /* Cold case for crossing page with first load. */
> + .p2align 4
> +L(cross_page_boundary):
> + andq $-VEC_SIZE, %rdi
> + andl $(VEC_SIZE - 1), %ecx
> +
> + vmovdqa (%rdi), %ymm8
> + VPCMPEQ %ymm8, %ymm0, %ymm1
> + VPCMPEQ %ymm8, %ymm9, %ymm2
> + vpor %ymm1, %ymm2, %ymm1
> + vpmovmskb %ymm1, %eax
> + /* Remove the leading bits. */
> + sarxl %ecx, %eax, %eax
> testl %eax, %eax
> -L(first_vec_x3):
> + jz L(aligned_more)
> tzcntl %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> - addq $(VEC_SIZE * 3), %rax
> + addq %rcx, %rdi
> addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
Remove the extra tab.
> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> ret
>
> END (STRCHR)
> -#endif
> +# endif
> diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
> index 583a152794..4dfbe3b58b 100644
> --- a/sysdeps/x86_64/multiarch/strchr.c
> +++ b/sysdeps/x86_64/multiarch/strchr.c
> @@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
>
> if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
> && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> return OPTIMIZE (avx2);
>
> --
> 2.29.2
>
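
Two of the remaining changes, restated in scalar C for clarity. The reworked entry check only diverts loads that can actually touch the next page: any start whose page offset leaves room for a full 32-byte read stays on the fast path, where the old code diverted every pointer whose offset mod 64 exceeded 32. A rough equivalent, assuming 4 KiB pages (the function name is illustrative):

#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

/* True iff a VEC_SIZE-byte load from S could cross into the next
   page; only the last VEC_SIZE - 1 offsets of a page qualify.  */
static inline int
load_may_cross_page (const char *s)
{
  return ((uintptr_t) s & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
}

On the cold path itself, the leading bytes are now shifted out with sarxl (three-operand, flags-free) instead of sarl %cl, which is what makes the added BMI2 check in the strchr.c selector necessary. Similarly, at the end of the main loop the last two vector masks are concatenated with salq/orq so a single tzcntq locates the first hit across 64 bytes.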