[v2,1/2] x86: Refactor and improve performance of strchr-avx2.S
Commit Message
No bug. Just seemed the performance could be improved a bit. Observed
and expected behavior are unchanged. Optimized body of main
loop. Updated page cross logic and optimized accordingly. Made a few
minor instruction selection modifications. No regressions in test
suite. Both test-strchrnul and test-strchr passed.
Signed-off-by: noah <goldstein.w.n@gmail.com>
---
Since V1 optimized more around smaller lengths. The original version
expected the 4x loop to be hit though the benchmarks in bench-strchr.c
indicate optimization for very short strings is most important.
Made the first 32 byte check expect to find either the end of the
string or character in question. As well increased number of vectors
in L(aligned_more) to 4. This does cost for most alignments if the 4x
loop is hit but is faster for strings < 128 byte.
sysdeps/x86_64/multiarch/strchr-avx2.S | 247 ++++++++++++-------------
sysdeps/x86_64/multiarch/strchr.c | 1 +
2 files changed, 124 insertions(+), 124 deletions(-)
Comments
On Sun, Jan 31, 2021 at 4:30 PM noah <goldstein.w.n@gmail.com> wrote:
>
> No bug. Just seemed the performance could be improved a bit. Observed
> and expected behavior are unchanged. Optimized body of main
> loop. Updated page cross logic and optimized accordingly. Made a few
> minor instruction selection modifications. No regressions in test
> suite. Both test-strchrnul and test-strchr passed.
>
> Signed-off-by: noah <goldstein.w.n@gmail.com>
> ---
> Since V1 optimized more around smaller lengths. The original version
> expected the 4x loop to be hit though the benchmarks in bench-strchr.c
> indicate optimization for very short strings is most important.
>
> Made the first 32 byte check expect to find either the end of the
> string or character in question. As well increased number of vectors
> in L(aligned_more) to 4. This does cost for most alignments if the 4x
> loop is hit but is faster for strings < 128 byte.
>
> sysdeps/x86_64/multiarch/strchr-avx2.S | 247 ++++++++++++-------------
> sysdeps/x86_64/multiarch/strchr.c | 1 +
> 2 files changed, 124 insertions(+), 124 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> index d416558d04..3012cb6ece 100644
> --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> @@ -27,10 +27,12 @@
> # ifdef USE_AS_WCSCHR
> # define VPBROADCAST vpbroadcastd
> # define VPCMPEQ vpcmpeqd
> +# define VPMINU vpminud
> # define CHAR_REG esi
> # else
> # define VPBROADCAST vpbroadcastb
> # define VPCMPEQ vpcmpeqb
> +# define VPMINU vpminub
> # define CHAR_REG sil
> # endif
>
> @@ -39,19 +41,25 @@
> # endif
>
> # define VEC_SIZE 32
> +# define PAGE_SIZE 4096
>
> .section .text.avx,"ax",@progbits
> ENTRY (STRCHR)
> - movl %edi, %ecx
> + movl %edi, %ecx
You replaced a tab with 8 spaces. Please put the tab back
and replace 8 spaces with a tab.
> +# ifndef USE_AS_STRCHRNUL
> + xorl %edx, %edx
> +# endif
> +
> /* Broadcast CHAR to YMM0. */
> vmovd %esi, %xmm0
> vpxor %xmm9, %xmm9, %xmm9
> VPBROADCAST %xmm0, %ymm0
> - /* Check if we may cross page boundary with one vector load. */
> - andl $(2 * VEC_SIZE - 1), %ecx
> - cmpl $VEC_SIZE, %ecx
> - ja L(cros_page_boundary)
> -
> +
> + /* Check if we cross page boundary with one vector load. */
> + andl $(PAGE_SIZE - 1), %ecx
> + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
> + ja L(cross_page_boundary)
> +
> /* Check the first VEC_SIZE bytes. Search for both CHAR and the
> null byte. */
> vmovdqu (%rdi), %ymm8
> @@ -60,50 +68,27 @@ ENTRY (STRCHR)
> vpor %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> - jnz L(first_vec_x0)
> -
> - /* Align data for aligned loads in the loop. */
> - addq $VEC_SIZE, %rdi
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> -
> - jmp L(more_4x_vec)
> -
> - .p2align 4
> -L(cros_page_boundary):
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> - vmovdqu (%rdi), %ymm8
> - VPCMPEQ %ymm8, %ymm0, %ymm1
> - VPCMPEQ %ymm8, %ymm9, %ymm2
> - vpor %ymm1, %ymm2, %ymm1
> - vpmovmskb %ymm1, %eax
> - /* Remove the leading bytes. */
> - sarl %cl, %eax
> - testl %eax, %eax
> - jz L(aligned_more)
> + jz L(more_vecs)
> + tzcntl %eax, %eax
> /* Found CHAR or the null byte. */
> - tzcntl %eax, %eax
> - addq %rcx, %rax
> -# ifdef USE_AS_STRCHRNUL
> addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> - leaq (%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> ret
>
> - .p2align 4
> + .p2align 4
> +L(more_vecs):
> + /* Align data for aligned loads in the loop. */
> + andq $-VEC_SIZE, %rdi
> L(aligned_more):
> - addq $VEC_SIZE, %rdi
>
> -L(more_4x_vec):
> - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> - since data is only aligned to VEC_SIZE. */
> - vmovdqa (%rdi), %ymm8
> + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
> + since data is only aligned to VEC_SIZE. */
> + vmovdqa VEC_SIZE(%rdi), %ymm8
> + addq $VEC_SIZE, %rdi
> VPCMPEQ %ymm8, %ymm0, %ymm1
> VPCMPEQ %ymm8, %ymm9, %ymm2
> vpor %ymm1, %ymm2, %ymm1
> @@ -125,7 +110,7 @@ L(more_4x_vec):
> vpor %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> - jnz L(first_vec_x2)
> + jnz L(first_vec_x2)
>
> vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> VPCMPEQ %ymm8, %ymm0, %ymm1
> @@ -133,122 +118,136 @@ L(more_4x_vec):
> vpor %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> - jnz L(first_vec_x3)
> -
> - addq $(VEC_SIZE * 4), %rdi
> -
> - /* Align data to 4 * VEC_SIZE. */
> - movq %rdi, %rcx
> - andl $(4 * VEC_SIZE - 1), %ecx
> - andq $-(4 * VEC_SIZE), %rdi
> -
> - .p2align 4
> -L(loop_4x_vec):
> - /* Compare 4 * VEC at a time forward. */
> - vmovdqa (%rdi), %ymm5
> - vmovdqa VEC_SIZE(%rdi), %ymm6
> - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> -
> - VPCMPEQ %ymm5, %ymm0, %ymm1
> - VPCMPEQ %ymm6, %ymm0, %ymm2
> - VPCMPEQ %ymm7, %ymm0, %ymm3
> - VPCMPEQ %ymm8, %ymm0, %ymm4
> -
> - VPCMPEQ %ymm5, %ymm9, %ymm5
> - VPCMPEQ %ymm6, %ymm9, %ymm6
> - VPCMPEQ %ymm7, %ymm9, %ymm7
> - VPCMPEQ %ymm8, %ymm9, %ymm8
> -
> - vpor %ymm1, %ymm5, %ymm1
> - vpor %ymm2, %ymm6, %ymm2
> - vpor %ymm3, %ymm7, %ymm3
> - vpor %ymm4, %ymm8, %ymm4
> -
> - vpor %ymm1, %ymm2, %ymm5
> - vpor %ymm3, %ymm4, %ymm6
> + jz L(prep_loop_4x)
>
> - vpor %ymm5, %ymm6, %ymm5
> -
> - vpmovmskb %ymm5, %eax
> - testl %eax, %eax
> - jnz L(4x_vec_end)
> -
> - addq $(VEC_SIZE * 4), %rdi
> -
> - jmp L(loop_4x_vec)
> + tzcntl %eax, %eax
> + leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
> + cmovne %rdx, %rax
> +# endif
> + VZEROUPPER
> + ret
>
> - .p2align 4
> + .p2align 4
> L(first_vec_x0):
> + tzcntl %eax, %eax
> /* Found CHAR or the null byte. */
> - tzcntl %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> - leaq (%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> ret
> -
> +
> .p2align 4
> L(first_vec_x1):
> - tzcntl %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> - addq $VEC_SIZE, %rax
> - addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> - leaq VEC_SIZE(%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> + tzcntl %eax, %eax
> + leaq VEC_SIZE(%rdi, %rax), %rax
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> - ret
> -
> - .p2align 4
> + ret
> +
> + .p2align 4
> L(first_vec_x2):
> - tzcntl %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> - addq $(VEC_SIZE * 2), %rax
> - addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> + tzcntl %eax, %eax
> + /* Found CHAR or the null byte. */
> leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> ret
> +
> +L(prep_loop_4x):
> + /* Align data to 4 * VEC_SIZE. */
> + andq $-(VEC_SIZE * 4), %rdi
>
> .p2align 4
> -L(4x_vec_end):
> +L(loop_4x_vec):
> + /* Compare 4 * VEC at a time forward. */
> + vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> + vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> + vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> + vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> +
> + /* Leaves only CHARS matching esi as 0. */
> + vpxor %ymm5, %ymm0, %ymm1
> + vpxor %ymm6, %ymm0, %ymm2
> + vpxor %ymm7, %ymm0, %ymm3
> + vpxor %ymm8, %ymm0, %ymm4
> +
> + VPMINU %ymm1, %ymm5, %ymm1
> + VPMINU %ymm2, %ymm6, %ymm2
> + VPMINU %ymm3, %ymm7, %ymm3
> + VPMINU %ymm4, %ymm8, %ymm4
> +
> + VPMINU %ymm1, %ymm2, %ymm5
> + VPMINU %ymm3, %ymm4, %ymm6
> +
> + VPMINU %ymm5, %ymm6, %ymm5
> +
> + VPCMPEQ %ymm5, %ymm9, %ymm5
> + vpmovmskb %ymm5, %eax
> +
> + addq $(VEC_SIZE * 4), %rdi
> + testl %eax, %eax
> + jz L(loop_4x_vec)
> +
> + VPCMPEQ %ymm1, %ymm9, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> jnz L(first_vec_x0)
> - vpmovmskb %ymm2, %eax
> +
> + VPCMPEQ %ymm2, %ymm9, %ymm2
> + vpmovmskb %ymm2, %eax
> testl %eax, %eax
> jnz L(first_vec_x1)
> - vpmovmskb %ymm3, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x2)
> +
> + VPCMPEQ %ymm3, %ymm9, %ymm3
> + VPCMPEQ %ymm4, %ymm9, %ymm4
> + vpmovmskb %ymm3, %ecx
> vpmovmskb %ymm4, %eax
> + salq $32, %rax
> + orq %rcx, %rax
> + tzcntq %rax, %rax
> + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
> + cmovne %rdx, %rax
> +# endif
> + VZEROUPPER
> + ret
> +
> + /* Cold case for crossing page with first load. */
> + .p2align 4
> +L(cross_page_boundary):
> + andq $-VEC_SIZE, %rdi
> + andl $(VEC_SIZE - 1), %ecx
> +
> + vmovdqa (%rdi), %ymm8
> + VPCMPEQ %ymm8, %ymm0, %ymm1
> + VPCMPEQ %ymm8, %ymm9, %ymm2
> + vpor %ymm1, %ymm2, %ymm1
> + vpmovmskb %ymm1, %eax
> + /* Remove the leading bits. */
> + sarxl %ecx, %eax, %eax
> testl %eax, %eax
> -L(first_vec_x3):
> + jz L(aligned_more)
> tzcntl %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> - addq $(VEC_SIZE * 3), %rax
> + addq %rcx, %rdi
> addq %rdi, %rax
> -# else
> - xorl %edx, %edx
> - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
> - cmp (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> + cmp (%rax), %CHAR_REG
> cmovne %rdx, %rax
> # endif
> VZEROUPPER
> ret
>
> END (STRCHR)
> -#endif
> +# endif
> diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
> index 583a152794..4dfbe3b58b 100644
> --- a/sysdeps/x86_64/multiarch/strchr.c
> +++ b/sysdeps/x86_64/multiarch/strchr.c
> @@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
>
> if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
> && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> return OPTIMIZE (avx2);
>
> --
> 2.29.2
>
@@ -27,10 +27,12 @@
# ifdef USE_AS_WCSCHR
# define VPBROADCAST vpbroadcastd
# define VPCMPEQ vpcmpeqd
+# define VPMINU vpminud
# define CHAR_REG esi
# else
# define VPBROADCAST vpbroadcastb
# define VPCMPEQ vpcmpeqb
+# define VPMINU vpminub
# define CHAR_REG sil
# endif
@@ -39,19 +41,25 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
.section .text.avx,"ax",@progbits
ENTRY (STRCHR)
- movl %edi, %ecx
+ movl %edi, %ecx
+# ifndef USE_AS_STRCHRNUL
+ xorl %edx, %edx
+# endif
+
/* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
vpxor %xmm9, %xmm9, %xmm9
VPBROADCAST %xmm0, %ymm0
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
-
+
+ /* Check if we cross page boundary with one vector load. */
+ andl $(PAGE_SIZE - 1), %ecx
+ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
+ ja L(cross_page_boundary)
+
/* Check the first VEC_SIZE bytes. Search for both CHAR and the
null byte. */
vmovdqu (%rdi), %ymm8
@@ -60,50 +68,27 @@ ENTRY (STRCHR)
vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
- jnz L(first_vec_x0)
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
-
- jmp L(more_4x_vec)
-
- .p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- vmovdqu (%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
- /* Remove the leading bytes. */
- sarl %cl, %eax
- testl %eax, %eax
- jz L(aligned_more)
+ jz L(more_vecs)
+ tzcntl %eax, %eax
/* Found CHAR or the null byte. */
- tzcntl %eax, %eax
- addq %rcx, %rax
-# ifdef USE_AS_STRCHRNUL
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
- .p2align 4
+ .p2align 4
+L(more_vecs):
+ /* Align data for aligned loads in the loop. */
+ andq $-VEC_SIZE, %rdi
L(aligned_more):
- addq $VEC_SIZE, %rdi
-L(more_4x_vec):
- /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
- since data is only aligned to VEC_SIZE. */
- vmovdqa (%rdi), %ymm8
+ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ vmovdqa VEC_SIZE(%rdi), %ymm8
+ addq $VEC_SIZE, %rdi
VPCMPEQ %ymm8, %ymm0, %ymm1
VPCMPEQ %ymm8, %ymm9, %ymm2
vpor %ymm1, %ymm2, %ymm1
@@ -125,7 +110,7 @@ L(more_4x_vec):
vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
- jnz L(first_vec_x2)
+ jnz L(first_vec_x2)
vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
VPCMPEQ %ymm8, %ymm0, %ymm1
@@ -133,122 +118,136 @@ L(more_4x_vec):
vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
- jnz L(first_vec_x3)
-
- addq $(VEC_SIZE * 4), %rdi
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
-
- .p2align 4
-L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- vmovdqa (%rdi), %ymm5
- vmovdqa VEC_SIZE(%rdi), %ymm6
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
-
- VPCMPEQ %ymm5, %ymm0, %ymm1
- VPCMPEQ %ymm6, %ymm0, %ymm2
- VPCMPEQ %ymm7, %ymm0, %ymm3
- VPCMPEQ %ymm8, %ymm0, %ymm4
-
- VPCMPEQ %ymm5, %ymm9, %ymm5
- VPCMPEQ %ymm6, %ymm9, %ymm6
- VPCMPEQ %ymm7, %ymm9, %ymm7
- VPCMPEQ %ymm8, %ymm9, %ymm8
-
- vpor %ymm1, %ymm5, %ymm1
- vpor %ymm2, %ymm6, %ymm2
- vpor %ymm3, %ymm7, %ymm3
- vpor %ymm4, %ymm8, %ymm4
-
- vpor %ymm1, %ymm2, %ymm5
- vpor %ymm3, %ymm4, %ymm6
+ jz L(prep_loop_4x)
- vpor %ymm5, %ymm6, %ymm5
-
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
- jmp L(loop_4x_vec)
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
- .p2align 4
+ .p2align 4
L(first_vec_x0):
+ tzcntl %eax, %eax
/* Found CHAR or the null byte. */
- tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
-
+
.p2align 4
L(first_vec_x1):
- tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq VEC_SIZE(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+ tzcntl %eax, %eax
+ leaq VEC_SIZE(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
- ret
-
- .p2align 4
+ ret
+
+ .p2align 4
L(first_vec_x2):
- tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
-# else
- xorl %edx, %edx
+ tzcntl %eax, %eax
+ /* Found CHAR or the null byte. */
leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
+
+L(prep_loop_4x):
+ /* Align data to 4 * VEC_SIZE. */
+ andq $-(VEC_SIZE * 4), %rdi
.p2align 4
-L(4x_vec_end):
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
+ vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
+ vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
+ vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
+
+ /* Leaves only CHARS matching esi as 0. */
+ vpxor %ymm5, %ymm0, %ymm1
+ vpxor %ymm6, %ymm0, %ymm2
+ vpxor %ymm7, %ymm0, %ymm3
+ vpxor %ymm8, %ymm0, %ymm4
+
+ VPMINU %ymm1, %ymm5, %ymm1
+ VPMINU %ymm2, %ymm6, %ymm2
+ VPMINU %ymm3, %ymm7, %ymm3
+ VPMINU %ymm4, %ymm8, %ymm4
+
+ VPMINU %ymm1, %ymm2, %ymm5
+ VPMINU %ymm3, %ymm4, %ymm6
+
+ VPMINU %ymm5, %ymm6, %ymm5
+
+ VPCMPEQ %ymm5, %ymm9, %ymm5
+ vpmovmskb %ymm5, %eax
+
+ addq $(VEC_SIZE * 4), %rdi
+ testl %eax, %eax
+ jz L(loop_4x_vec)
+
+ VPCMPEQ %ymm1, %ymm9, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x0)
- vpmovmskb %ymm2, %eax
+
+ VPCMPEQ %ymm2, %ymm9, %ymm2
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
jnz L(first_vec_x1)
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
+
+ VPCMPEQ %ymm3, %ymm9, %ymm3
+ VPCMPEQ %ymm4, %ymm9, %ymm4
+ vpmovmskb %ymm3, %ecx
vpmovmskb %ymm4, %eax
+ salq $32, %rax
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ /* Cold case for crossing page with first load. */
+ .p2align 4
+L(cross_page_boundary):
+ andq $-VEC_SIZE, %rdi
+ andl $(VEC_SIZE - 1), %ecx
+
+ vmovdqa (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bits. */
+ sarxl %ecx, %eax, %eax
testl %eax, %eax
-L(first_vec_x3):
+ jz L(aligned_more)
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $(VEC_SIZE * 3), %rax
+ addq %rcx, %rdi
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER
ret
END (STRCHR)
-#endif
+# endif
@@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
&& CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
return OPTIMIZE (avx2);