@@ -1129,6 +1129,11 @@ disable_tsx:
hardware. */
cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
&= ~bit_arch_Avoid_Non_Temporal_Memset;
+
+ /* TZCNT performs better than BSF for string operations on Hygon
+ processors. */
+ cpu_features->preferred[index_arch_Prefer_TZCNT_for_stringop]
+ |= bit_arch_Prefer_TZCNT_for_stringop;
if (model < 0x4)
{
/* Unaligned AVX loads are slower. */
@@ -21,6 +21,7 @@ BIT (Fast_Copy_Backward)
BIT (Slow_BSF)
BIT (Fast_Unaligned_Load)
BIT (Prefer_PMINUB_for_stringop)
+BIT (Prefer_TZCNT_for_stringop)
BIT (Fast_Unaligned_Copy)
BIT (I586)
BIT (I686)
@@ -10,6 +10,7 @@ sysdep_routines += \
memcmp-avx2-movbe \
memcmp-avx2-movbe-rtm \
memcmp-evex-movbe \
+ memcmp-evex-movbe-tzcnt \
memcmp-sse2 \
memcmpeq-avx2 \
memcmpeq-avx2-rtm \
@@ -62,6 +63,7 @@ sysdep_routines += \
strchr-avx2 \
strchr-avx2-rtm \
strchr-evex \
+ strchr-evex-tzcnt \
strchr-evex512 \
strchr-sse2 \
strchr-sse2-no-bsf \
@@ -73,18 +75,21 @@ sysdep_routines += \
strcmp-avx2 \
strcmp-avx2-rtm \
strcmp-evex \
+ strcmp-evex-tzcnt \
strcmp-sse2 \
strcmp-sse2-unaligned \
strcmp-sse4_2 \
strcpy-avx2 \
strcpy-avx2-rtm \
strcpy-evex \
+ strcpy-evex-tzcnt \
strcpy-sse2 \
strcpy-sse2-unaligned \
strcspn-sse4 \
strlen-avx2 \
strlen-avx2-rtm \
strlen-evex \
+ strlen-evex-tzcnt \
strlen-evex512 \
strlen-sse2 \
strncase_l-avx2 \
@@ -99,6 +104,7 @@ sysdep_routines += \
strncmp-avx2 \
strncmp-avx2-rtm \
strncmp-evex \
+ strncmp-evex-tzcnt \
strncmp-sse2 \
strncmp-sse4_2 \
strncpy-avx2 \
@@ -25,6 +25,10 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+#ifdef USE_EVEX_TZCNT
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_tzcnt) attribute_hidden;
+#endif
+
#ifdef USE_EVEX512
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex512) attribute_hidden;
#endif
@@ -49,6 +53,13 @@ IFUNC_SELECTOR (void)
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
{
+#ifdef USE_EVEX_TZCNT
+ /* Only pick the tzcnt variant when BMI1, which provides tzcnt,
+ is usable. This matches the other tzcnt selectors. */
+ if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_TZCNT_for_stringop)
+ && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1))
+ return OPTIMIZE (evex_tzcnt);
+#endif
#ifdef USE_EVEX512
if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_EVEX512))
return OPTIMIZE (evex512);
@@ -99,6 +99,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__memcmp_evex_movbe)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, memcmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI1)
+ && CPU_FEATURE_USABLE (BMI2)
+ && CPU_FEATURE_USABLE (MOVBE)),
+ __memcmp_evex_movbe_tzcnt)
X86_IFUNC_IMPL_ADD_V3 (array, i, memcmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)
@@ -374,6 +381,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strlen_evex)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, strlen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI1)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __strlen_evex_tzcnt)
X86_IFUNC_IMPL_ADD_V4 (array, i, strlen,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
@@ -544,6 +557,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strchr_evex)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, strchr,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI1)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __strchr_evex_tzcnt)
X86_IFUNC_IMPL_ADD_V4 (array, i, strchr,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
@@ -629,6 +648,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strcmp_evex)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, strcmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI1)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __strcmp_evex_tzcnt)
X86_IFUNC_IMPL_ADD_V3 (array, i, strcmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
@@ -656,6 +681,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strcpy_evex)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, strcpy,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI1)),
+ __strcpy_evex_tzcnt)
X86_IFUNC_IMPL_ADD_V3 (array, i, strcpy,
CPU_FEATURE_USABLE (AVX2),
__strcpy_avx2)
@@ -1318,6 +1348,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strncmp_evex)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, strncmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI1)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __strncmp_evex_tzcnt)
X86_IFUNC_IMPL_ADD_V3 (array, i, strncmp,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
@@ -20,6 +20,9 @@
# include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
+#ifdef USE_EVEX_TZCNT
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe_tzcnt) attribute_hidden;
+#endif
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
@@ -39,7 +42,15 @@ IFUNC_SELECTOR (void)
{
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
- return OPTIMIZE (evex_movbe);
+ {
+#ifdef USE_EVEX_TZCNT
+ if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_TZCNT_for_stringop)
+ && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1))
+ return OPTIMIZE (evex_movbe_tzcnt);
+#endif
+
+ return OPTIMIZE (evex_movbe);
+ }
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
return OPTIMIZE (avx2_movbe_rtm);
@@ -21,6 +21,9 @@
#include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+#ifdef USE_EVEX_TZCNT
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_tzcnt) attribute_hidden;
+#endif
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
@@ -41,7 +44,15 @@ IFUNC_SELECTOR (void)
{
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
- return OPTIMIZE (evex);
+ {
+#ifdef USE_EVEX_TZCNT
+ if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_TZCNT_for_stringop)
+ && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1))
+ return OPTIMIZE (evex_tzcnt);
+#endif
+
+ return OPTIMIZE (evex);
+ }
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
return OPTIMIZE (avx2_rtm);
new file mode 100644
@@ -0,0 +1,8 @@
+#ifndef MEMCMP
+# define MEMCMP __memcmp_evex_movbe_tzcnt
+#endif
+/* Override the bit-scan macros so memcmp-evex-movbe.S uses tzcnt.  */
+#define FIND_FIRST_SET(src, dst) tzcnt src, dst
+#define FIND_FIRST_SETQ(src, dst) tzcntq src, dst
+
+#include "memcmp-evex-movbe.S"
@@ -138,7 +138,7 @@ ENTRY_P2ALIGN (MEMCMP, 6)
.p2align 4,, 11
L(return_vec_0):
- bsf %VRAX, %VRAX
+ FIND_FIRST_SET (%VRAX, %VRAX)
# ifdef USE_AS_WMEMCMP
movl (%rdi, %rax, CHAR_SIZE), %ecx
xorl %edx, %edx
@@ -267,10 +267,7 @@ L(return_vec_0_1_2_3):
jnz L(return_vec_2)
.p2align 4,, 2
L(return_vec_3):
- /* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one
- fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
- line. */
- bsf %VRCX, %VRCX
+ FIND_FIRST_SET (%VRCX, %VRCX)
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
xorl %edx, %edx
@@ -287,9 +284,7 @@ L(return_vec_3):
.p2align 4,, 8
L(return_vec_1):
- /* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
- fetch block. */
- bsf %VRAX, %VRAX
+ FIND_FIRST_SET (%VRAX, %VRAX)
# ifdef USE_AS_WMEMCMP
movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
xorl %edx, %edx
@@ -305,9 +300,7 @@ L(return_vec_1):
.p2align 4,, 7
L(return_vec_2):
- /* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
- fetch block. */
- bsf %VRAX, %VRAX
+ FIND_FIRST_SET (%VRAX, %VRAX)
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
xorl %edx, %edx
@@ -422,7 +415,7 @@ L(8x_last_1x_vec):
L(8x_return_vec_2):
subq $VEC_SIZE, %rdx
L(8x_return_vec_3):
- bsf %VRAX, %VRAX
+ FIND_FIRST_SET (%VRAX, %VRAX)
# ifdef USE_AS_WMEMCMP
leaq (%rdx, %rax, CHAR_SIZE), %rax
movl (VEC_SIZE * 3)(%rax), %ecx
@@ -463,9 +456,7 @@ L(last_1x_vec):
*/
.p2align 4,, 3
L(return_vec_1_end):
- /* Use bsf to save code size. This is necessary to have
- L(one_or_less) fit in aligning bytes between. */
- bsf %VRAX, %VRAX
+ FIND_FIRST_SET (%VRAX, %VRAX)
addl %edx, %eax
# ifdef USE_AS_WMEMCMP
movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
@@ -484,7 +475,7 @@ L(return_vec_1_end):
/* Don't align. Takes 2-fetch blocks either way and aligning
will cause code to spill into another cacheline. */
L(return_vec_0_end):
- bsf %VRAX, %VRAX
+ FIND_FIRST_SET (%VRAX, %VRAX)
addl %edx, %eax
# ifdef USE_AS_WMEMCMP
movl -VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
@@ -24,6 +24,7 @@
# undef memcmp
# define SYMBOL_NAME memcmp
+# define USE_EVEX_TZCNT 1
# include "ifunc-memcmp.h"
libc_ifunc_redirected (__redirect_memcmp, memcmp, IFUNC_SELECTOR ());
new file mode 100644
@@ -0,0 +1,8 @@
+#ifndef STRCHR
+# define STRCHR __strchr_evex_tzcnt
+#endif
+/* Override the bit-scan macros so strchr-evex.S uses tzcnt.  */
+#define FIND_FIRST_SET(src, dst) tzcnt src, dst
+#define FIND_FIRST_SETQ(src, dst) tzcntq src, dst
+
+#include "strchr-evex.S"
@@ -116,12 +116,16 @@ ENTRY_P2ALIGN (STRCHR, 6)
/* If VEC_SIZE == 64 && STRCHRNUL use bsf to test condition so
that all logic for match/null in first VEC first in 1x cache
lines. This has a slight cost to larger sizes. */
+ /* NB: This must stay a literal bsf rather than FIND_FIRST_SET.
+ The jz below needs ZF to mean "source was zero"; tzcnt
+ reports that in CF and sets ZF when the result is zero
+ instead. */
bsf %VRAX, %VRAX
jz L(aligned_more)
# else
test %VRAX, %VRAX
jz L(aligned_more)
- bsf %VRAX, %VRAX
+ FIND_FIRST_SET (%VRAX, %VRAX)
# endif
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
@@ -161,7 +161,7 @@ L(last_vec_x2):
L(first_vec_x1):
- /* Use bsf here to save 1-byte keeping keeping the block in 1x
- fetch block. eax guaranteed non-zero. */
- bsf %VRCX, %VRCX
+ /* The default bsf saves 1-byte over tzcnt, keeping the block in
+ 1x fetch block. eax guaranteed non-zero. */
+ FIND_FIRST_SET (%VRCX, %VRCX)
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
cmp (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %CHAR_REG
@@ -188,7 +188,7 @@ L(first_vec_x2):
/* Combine CHAR and null matches. */
KOR %k0, %k1, %k0
KMOV %k0, %VRAX
- bsf %VRAX, %VRAX
+ FIND_FIRST_SET (%VRAX, %VRAX)
# endif
/* NB: Multiply sizeof char type (1 or 4) to get the number of
bytes. */
@@ -384,7 +384,7 @@ L(loop_4x_vec):
orq %rcx, %rdx
# endif
- bsfq %rdx, %rdx
+ FIND_FIRST_SETQ (%rdx, %rdx)
# ifndef USE_AS_STRCHRNUL
/* Check if match was CHAR or null. */
@@ -409,7 +409,7 @@ L(zero_end):
# if VEC_SIZE == 64
.p2align 4,, 8
L(last_vec_x1_vec_size64):
- bsf %VRCX, %VRCX
+ FIND_FIRST_SET (%VRCX, %VRCX)
# ifndef USE_AS_STRCHRNUL
/* Check if match was null. */
cmp (%rax, %rcx, CHAR_SIZE), %CHAR_REG
@@ -435,7 +435,7 @@ L(last_vec_x3):
/* Duplicate L(last_vec_x2) for VEC_SIZE == 64 because we can't
reuse L(first_vec_x3) due to register mismatch. */
L(last_vec_x2):
- bsf %VGPR(MASK_GPR), %VGPR(MASK_GPR)
+ FIND_FIRST_SET (%VGPR(MASK_GPR), %VGPR(MASK_GPR))
# ifndef USE_AS_STRCHRNUL
/* Check if match was null. */
cmp (VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %CHAR_REG
@@ -493,7 +493,7 @@ L(cross_page_boundary_real):
.p2align 4,, 10
L(last_vec_x1_vec_size32):
- bsf %VRAX, %VRAX
+ FIND_FIRST_SET (%VRAX, %VRAX)
# ifdef USE_AS_WCSCHR
/* NB: Multiply wchar_t count by 4 to get the number of bytes.
*/
@@ -27,6 +27,7 @@
# include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_tzcnt) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex512) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
@@ -48,6 +49,10 @@ IFUNC_SELECTOR (void)
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
{
+ if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_TZCNT_for_stringop)
+ && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1))
+ return OPTIMIZE (evex_tzcnt);
+
if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_EVEX512))
return OPTIMIZE (evex512);
new file mode 100644
@@ -0,0 +1,5 @@
+#define STRCMP_ISA _evex_tzcnt
+#define FIND_FIRST_SET(src, dst) tzcnt src, dst
+#define FIND_FIRST_SETQ(src, dst) tzcntq src, dst
+/* strcmp-evex.S keeps a pre-set STRCMP_ISA and uses the macros above.  */
+#include "strcmp-evex.S"
@@ -24,7 +24,9 @@
# include "x86-evex256-vecs.h"
# endif
-# define STRCMP_ISA _evex
+# ifndef STRCMP_ISA
+# define STRCMP_ISA _evex
+# endif
# include "strcmp-naming.h"
# include <sysdep.h>
@@ -335,7 +337,7 @@ L(no_page_cross):
.p2align 4,, 4
L(return_vec_0):
- bsf %VRCX, %VRCX
+ FIND_FIRST_SET (%VRCX, %VRCX)
# ifdef USE_AS_WCSCMP
movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
xorl %eax, %eax
@@ -406,7 +408,7 @@ L(ret1):
.p2align 4,, 10
L(return_vec_1):
- bsf %VRCX, %VRCX
+ FIND_FIRST_SET (%VRCX, %VRCX)
# ifdef USE_AS_STRNCMP
/* rdx must be > CHAR_PER_VEC so its safe to subtract without
worrying about underflow. */
@@ -447,7 +449,7 @@ L(return_vec_3):
# else
/* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
check it. */
- bsf %VRCX, %VRCX
+ FIND_FIRST_SET (%VRCX, %VRCX)
addl $(CHAR_PER_VEC), %ecx
cmpq %rcx, %rdx
ja L(ret_vec_3_finish)
@@ -460,9 +462,9 @@ L(return_vec_3):
2x VEC so need separate return label. */
L(return_vec_2):
# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
- bsf %VRCX, %VRCX
+ FIND_FIRST_SET (%VRCX, %VRCX)
# else
- bsfq %rcx, %rcx
+ FIND_FIRST_SETQ (%rcx, %rcx)
# endif
# ifdef USE_AS_STRNCMP
cmpq %rcx, %rdx
@@ -491,7 +493,7 @@ L(ret3):
# ifndef USE_AS_STRNCMP
.p2align 4,, 10
L(return_vec_3):
- bsf %VRCX, %VRCX
+ FIND_FIRST_SET (%VRCX, %VRCX)
# ifdef USE_AS_WCSCMP
movl (VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
xorl %eax, %eax
@@ -703,9 +705,9 @@ L(return_vec_2_3_end):
mismatch must entirely be from VEC 3 which is fully
represented by LOOP_REG. */
# if CHAR_PER_VEC <= 16
- bsf %LOOP_REG, %LOOP_REG
+ FIND_FIRST_SET (%LOOP_REG, %LOOP_REG)
# else
- bsfq %LOOP_REG64, %LOOP_REG64
+ FIND_FIRST_SETQ (%LOOP_REG64, %LOOP_REG64)
# endif
# ifdef USE_AS_STRNCMP
@@ -768,7 +770,7 @@ L(return_vec_1_end):
# else
/* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
check it. */
- bsf %VRCX, %VRCX
+ FIND_FIRST_SET (%VRCX, %VRCX)
addl $(CHAR_PER_VEC), %ecx
cmpq %rcx, %rdx
ja L(ret_vec_0_end_finish)
@@ -778,9 +780,9 @@ L(return_vec_1_end):
# endif
L(return_vec_0_end):
# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
- bsf %VRCX, %VRCX
+ FIND_FIRST_SET (%VRCX, %VRCX)
# else
- bsfq %rcx, %rcx
+ FIND_FIRST_SETQ (%rcx, %rcx)
# endif
# ifdef USE_AS_STRNCMP
@@ -816,7 +818,7 @@ L(ret6):
# ifndef USE_AS_STRNCMP
.p2align 4,, 10
L(return_vec_1_end):
- bsf %VRCX, %VRCX
+ FIND_FIRST_SET (%VRCX, %VRCX)
# ifdef USE_AS_WCSCMP
movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
xorl %eax, %eax
@@ -843,7 +845,7 @@ L(ret7):
2x VEC so need separate return label. */
# if CHAR_PER_VEC == 64
L(return_vec_2_end):
- bsf %VRCX, %VRCX
+ FIND_FIRST_SET (%VRCX, %VRCX)
# ifdef USE_AS_STRNCMP
cmpq %rcx, %rdx
jbe L(ret_zero_end)
@@ -963,7 +965,7 @@ L(less_1x_vec_till_page_cross):
and %VR10, %VRCX
jz L(loop_skip_page_cross_check)
- bsf %VRCX, %VRCX
+ FIND_FIRST_SET (%VRCX, %VRCX)
# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
leal -VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
@@ -1104,7 +1106,7 @@ L(ret_zero_in_loop_page_cross):
L(return_vec_page_cross_0):
addl $-VEC_SIZE, %eax
L(return_vec_page_cross_1):
- bsf %VRCX, %VRCX
+ FIND_FIRST_SET (%VRCX, %VRCX)
# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
# ifdef USE_AS_STRNCMP
@@ -27,6 +27,7 @@
# include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_tzcnt) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
@@ -51,7 +52,13 @@ IFUNC_SELECTOR (void)
{
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
- return OPTIMIZE (evex);
+ {
+ if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_TZCNT_for_stringop)
+ && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1))
+ return OPTIMIZE (evex_tzcnt);
+
+ return OPTIMIZE (evex);
+ }
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
return OPTIMIZE (avx2_rtm);
new file mode 100644
@@ -0,0 +1,10 @@
+/* Build strcpy-evex.S with both bit-scan macros overridden to use
+   tzcnt, matching the other *-evex-tzcnt wrappers.  */
+#ifndef STRCPY
+# define STRCPY __strcpy_evex_tzcnt
+#endif
+
+#define FIND_FIRST_SET(src, dst) tzcnt src, dst
+#define FIND_FIRST_SETQ(src, dst) tzcntq src, dst
+
+#include "strcpy-evex.S"
@@ -121,7 +121,7 @@ L(page_cross_continue):
VMOVU_MASK %VMM(0), (%rdi){%k1}
# ifdef USE_AS_STPCPY
- bsf %VRCX, %VRCX
+ FIND_FIRST_SET (%VRCX, %VRCX)
leaq (%rdi, %rcx, CHAR_SIZE), %rax
# endif
ret
@@ -133,7 +133,7 @@ L(page_cross_continue):
jz L(more_1x_vec)
xorl %edx, %edx
- bsf %VRCX, %VRDX
+ FIND_FIRST_SET (%VRCX, %VRDX)
# ifdef USE_AS_STPCPY
leaq (%rdi, %rdx, CHAR_SIZE), %rax
# endif
@@ -351,7 +351,7 @@ L(loop_4x_done):
meaningfuly benefit doing this for stpcpy. */
KMOV %k4, %VRDX
L(ret_vec_x3):
- bsf %VRDX, %VRDX
+ FIND_FIRST_SET (%VRDX, %VRDX)
VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
VMOVU %VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
@@ -362,7 +362,7 @@ L(return_end):
.p2align 4,, 6
L(ret_vec_x0_end):
- bsf %VRCX, %VRCX
+ FIND_FIRST_SET (%VRCX, %VRCX)
# ifdef USE_AS_STPCPY
leaq (%rdi, %rcx, CHAR_SIZE), %rax
# endif
@@ -373,7 +373,7 @@ L(ret_vec_x0_end):
.p2align 4,, 8
L(ret_vec_x1):
- bsf %VRCX, %VRCX
+ FIND_FIRST_SET (%VRCX, %VRCX)
VMOVU (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
VMOVU %VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
@@ -383,7 +383,7 @@ L(ret_vec_x1):
.p2align 4,, 4
L(ret_vec_x2):
- bsf %VRCX, %VRCX
+ FIND_FIRST_SET (%VRCX, %VRCX)
VMOVU ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
VMOVU %VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
@@ -394,7 +394,7 @@ L(ret_vec_x2):
/* ret_vec_x3 reuses return code after the loop. */
.p2align 4,, 6
L(ret_vec_x4):
- bsf %VRCX, %VRCX
+ FIND_FIRST_SET (%VRCX, %VRCX)
VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
@@ -432,7 +432,7 @@ L(page_cross):
# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
movq %rdi, %rax
# endif
- bsf %VRCX, %VRCX
+ FIND_FIRST_SET (%VRCX, %VRCX)
REP_MOVS
# ifdef USE_AS_STPCPY
@@ -454,7 +454,7 @@ L(page_cross):
xorl %edx, %edx
# endif
/* Dependency on rdi must already have been satisfied. */
- bsf %VRCX, %VRDX
+ FIND_FIRST_SET (%VRCX, %VRDX)
# ifdef USE_AS_STPCPY
leaq (%rdi, %rdx, CHAR_SIZE), %rax
# elif !defined USE_AS_STRCAT
@@ -24,6 +24,7 @@
# undef strcpy
# define SYMBOL_NAME strcpy
+# define USE_EVEX_TZCNT 1
# include "ifunc-strcpy.h"
libc_ifunc_redirected (__redirect_strcpy, strcpy, IFUNC_SELECTOR ());
@@ -82,12 +82,12 @@ ENTRY_P2ALIGN(STRLEN, 6)
KMOV %k0, %VRAX
test %VRAX, %VRAX
jz L(aligned_more)
- bsf %VRAX, %VRAX
+ FIND_FIRST_SET (%VRAX, %VRAX)
ret
.p2align 4,, 8
L(first_vec_x4):
- bsf %VRAX, %VRAX
+ FIND_FIRST_SET (%VRAX, %VRAX)
subl %ecx, %edi
CHAR_SIZE_SHIFT_REG (edi)
leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
@@ -178,7 +178,7 @@ L(loop_4x_vec):
/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM. */
.p2align 4,, 2
L(FALLTHROUGH_RETURN_LBL):
- bsfq %rax, %rax
+ FIND_FIRST_SETQ (%rax, %rax)
subq %rcx, %rdi
CHAR_SIZE_SHIFT_REG (rdi)
leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
@@ -186,7 +186,7 @@ L(FALLTHROUGH_RETURN_LBL):
.p2align 4,, 8
L(first_vec_x0):
- bsf %VRAX, %VRAX
+ FIND_FIRST_SET (%VRAX, %VRAX)
sub %rcx, %rdi
CHAR_SIZE_SHIFT_REG (rdi)
addq %rdi, %rax
@@ -194,7 +194,7 @@ L(first_vec_x0):
.p2align 4,, 10
L(first_vec_x1):
- bsf %VRAX, %VRAX
+ FIND_FIRST_SET (%VRAX, %VRAX)
sub %rcx, %rdi
CHAR_SIZE_SHIFT_REG (rdi)
leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
@@ -203,7 +203,7 @@ L(first_vec_x1):
.p2align 4,, 10
/* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM. */
L(TAIL_RETURN_LBL):
- bsf %VRAX, %VRAX
+ FIND_FIRST_SET (%VRAX, %VRAX)
sub %VRCX, %VRDI
CHAR_SIZE_SHIFT_REG (VRDI)
lea (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
@@ -228,7 +228,7 @@ L(cross_page_boundary):
shr %cl, %VRAX
# endif
jz L(cross_page_continue)
- bsf %VRAX, %VRAX
+ FIND_FIRST_SET (%VRAX, %VRAX)
ret
END(STRLEN)
new file mode 100644
@@ -0,0 +1,10 @@
+#ifndef STRLEN
+# define STRLEN __strlen_evex_tzcnt
+#endif
+/* Override the bit-scan macros so the included code uses tzcnt.  */
+#define FIND_FIRST_SET(src, dst) tzcnt src, dst
+#define FIND_FIRST_SETQ(src, dst) tzcntq src, dst
+
+#include "x86-evex256-vecs.h"
+#include "reg-macros.h"
+#include "strlen-evex-base.S"
@@ -24,6 +24,7 @@
# undef strlen
# define SYMBOL_NAME strlen
+# define USE_EVEX_TZCNT 1
# define USE_EVEX512 1
# include "ifunc-avx2.h"
new file mode 100644
@@ -0,0 +1,6 @@
+#define STRCMP_ISA _evex_tzcnt
+#define FIND_FIRST_SET(src, dst) tzcnt src, dst
+#define FIND_FIRST_SETQ(src, dst) tzcntq src, dst
+#define USE_AS_STRNCMP 1
+/* USE_AS_STRNCMP selects the strncmp paths of the shared strcmp-evex.S.  */
+#include "strcmp-evex.S"
@@ -27,6 +27,7 @@
# include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_tzcnt) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
@@ -47,7 +48,13 @@ IFUNC_SELECTOR (void)
{
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
&& X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
- return OPTIMIZE (evex);
+ {
+ if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_TZCNT_for_stringop)
+ && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1))
+ return OPTIMIZE (evex_tzcnt);
+
+ return OPTIMIZE (evex);
+ }
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
return OPTIMIZE (avx2_rtm);
@@ -175,6 +175,14 @@ lose: \
# define VZEROUPPER_RETURN VZEROUPPER; ret
#endif
+#ifndef FIND_FIRST_SET
+# define FIND_FIRST_SET(src, dst) bsf src, dst
+#endif
+/* Do not branch on the flags: bsf and tzcnt set ZF/CF differently.  */
+#ifndef FIND_FIRST_SETQ
+# define FIND_FIRST_SETQ(src, dst) bsfq src, dst
+#endif
+
#else /* __ASSEMBLER__ */
/* Instruction to operate on long and pointer. */