[RFC,v1,1/1] x86: Add TZCNT variants for selected Hygon string functions

Message ID 20260528081950.1417009-2-xiejiamei@hygon.cn (mailing list archive)
State New
Headers
Series x86: Use TZCNT-based EVEX string variants on Hygon |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-arm warning Skipped because it is an RFC
redhat-pt-bot/TryBot-32bit success Build for i686
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 warning Skipped because it is an RFC

Commit Message

Jiamei Xie May 28, 2026, 8:19 a.m. UTC
  On Hygon processors, TZCNT is faster than BSF for finding the first
set bit in hot string routines.  Add a Prefer_TZCNT_for_stringop
preferred-feature bit, set it for Hygon processors, and use it to
select new TZCNT-based EVEX variants of strlen, strcpy, strchr, strcmp,
strncmp and memcmp.

The new variants reuse the existing assembly sources through the
FIND_FIRST_SET/FIND_FIRST_SETQ macros added to sysdeps/x86_64/sysdep.h.
These macros default to bsf/bsfq and are overridden with tzcnt/tzcntq
by the new *-tzcnt.S wrapper files.  Default dispatch for non-Hygon
processors is unchanged.

Benchmark results for the new TZCNT variants, relative to the existing
EVEX variants named in the first column:

Function          Hygon model 7   Hygon model 8
__strlen_evex        +12.32%         +11.89%
__strcpy_evex         +5.83%          +4.25%
__strchr_evex         +9.21%          +9.76%
__strcmp_evex         +6.28%          +7.79%
__memcmp_evex        +10.30%         +10.99%
__strncmp_evex        +4.26%          +4.92%

Change-Id: Icd127e51536ad23be774a02335eced8de523939a
Signed-off-by: Jiamei Xie <xiejiamei@hygon.cn>
---
 sysdeps/x86/cpu-features.c                    |  5 +++
 ...cpu-features-preferred_feature_index_1.def |  1 +
 sysdeps/x86_64/multiarch/Makefile             |  6 ++++
 sysdeps/x86_64/multiarch/ifunc-avx2.h         |  8 +++++
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 36 +++++++++++++++++++
 sysdeps/x86_64/multiarch/ifunc-memcmp.h       | 13 ++++++-
 sysdeps/x86_64/multiarch/ifunc-strcpy.h       | 13 ++++++-
 .../multiarch/memcmp-evex-movbe-tzcnt.S       |  8 +++++
 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S  | 23 ++++--------
 sysdeps/x86_64/multiarch/memcmp.c             |  1 +
 sysdeps/x86_64/multiarch/strchr-evex-tzcnt.S  |  8 +++++
 sysdeps/x86_64/multiarch/strchr-evex.S        | 16 ++++-----
 sysdeps/x86_64/multiarch/strchr.c             |  5 +++
 sysdeps/x86_64/multiarch/strcmp-evex-tzcnt.S  |  5 +++
 sysdeps/x86_64/multiarch/strcmp-evex.S        | 34 +++++++++---------
 sysdeps/x86_64/multiarch/strcmp.c             |  9 ++++-
 sysdeps/x86_64/multiarch/strcpy-evex-tzcnt.S  |  7 ++++
 sysdeps/x86_64/multiarch/strcpy-evex.S        | 18 +++++-----
 sysdeps/x86_64/multiarch/strcpy.c             |  1 +
 sysdeps/x86_64/multiarch/strlen-evex-base.S   | 14 ++++----
 sysdeps/x86_64/multiarch/strlen-evex-tzcnt.S  | 10 ++++++
 sysdeps/x86_64/multiarch/strlen.c             |  1 +
 sysdeps/x86_64/multiarch/strncmp-evex-tzcnt.S |  6 ++++
 sysdeps/x86_64/multiarch/strncmp.c            |  9 ++++-
 sysdeps/x86_64/sysdep.h                       |  8 +++++
 25 files changed, 205 insertions(+), 60 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/memcmp-evex-movbe-tzcnt.S
 create mode 100644 sysdeps/x86_64/multiarch/strchr-evex-tzcnt.S
 create mode 100644 sysdeps/x86_64/multiarch/strcmp-evex-tzcnt.S
 create mode 100644 sysdeps/x86_64/multiarch/strcpy-evex-tzcnt.S
 create mode 100644 sysdeps/x86_64/multiarch/strlen-evex-tzcnt.S
 create mode 100644 sysdeps/x86_64/multiarch/strncmp-evex-tzcnt.S
  

Patch

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index a453136827..6114065707 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -1129,6 +1129,11 @@  disable_tsx:
        hardware.  */
       cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
 	    &= ~bit_arch_Avoid_Non_Temporal_Memset;
+
+      /* TZCNT performs better than BSF for string operations on Hygon
+	 processors.  */
+      cpu_features->preferred[index_arch_Prefer_TZCNT_for_stringop]
+	    |= bit_arch_Prefer_TZCNT_for_stringop;
       if (model < 0x4)
 	{
 	  /*  Unaligned AVX loads are slower.  */
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index 74acb3fde1..8b0fe43703 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -21,6 +21,7 @@  BIT (Fast_Copy_Backward)
 BIT (Slow_BSF)
 BIT (Fast_Unaligned_Load)
 BIT (Prefer_PMINUB_for_stringop)
+BIT (Prefer_TZCNT_for_stringop)
 BIT (Fast_Unaligned_Copy)
 BIT (I586)
 BIT (I686)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 696cb66991..35dc92b877 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -10,6 +10,7 @@  sysdep_routines += \
   memcmp-avx2-movbe \
   memcmp-avx2-movbe-rtm \
   memcmp-evex-movbe \
+  memcmp-evex-movbe-tzcnt \
   memcmp-sse2 \
   memcmpeq-avx2 \
   memcmpeq-avx2-rtm \
@@ -62,6 +63,7 @@  sysdep_routines += \
   strchr-avx2 \
   strchr-avx2-rtm \
   strchr-evex \
+  strchr-evex-tzcnt \
   strchr-evex512 \
   strchr-sse2 \
   strchr-sse2-no-bsf \
@@ -73,18 +75,21 @@  sysdep_routines += \
   strcmp-avx2 \
   strcmp-avx2-rtm \
   strcmp-evex \
+  strcmp-evex-tzcnt \
   strcmp-sse2 \
   strcmp-sse2-unaligned \
   strcmp-sse4_2 \
   strcpy-avx2 \
   strcpy-avx2-rtm \
   strcpy-evex \
+  strcpy-evex-tzcnt \
   strcpy-sse2 \
   strcpy-sse2-unaligned \
   strcspn-sse4 \
   strlen-avx2 \
   strlen-avx2-rtm \
   strlen-evex \
+  strlen-evex-tzcnt \
   strlen-evex512 \
   strlen-sse2 \
   strncase_l-avx2 \
@@ -99,6 +104,7 @@  sysdep_routines += \
   strncmp-avx2 \
   strncmp-avx2-rtm \
   strncmp-evex \
+  strncmp-evex-tzcnt \
   strncmp-sse2 \
   strncmp-sse4_2 \
   strncpy-avx2 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
index 474466ba93..074e190df2 100644
--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
+++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
@@ -25,6 +25,10 @@ 
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
+#ifdef USE_EVEX_TZCNT
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_tzcnt) attribute_hidden;
+#endif
+
 #ifdef USE_EVEX512
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex512) attribute_hidden;
 #endif
@@ -49,6 +53,10 @@  IFUNC_SELECTOR (void)
       if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
       {
+#ifdef USE_EVEX_TZCNT
+	if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_TZCNT_for_stringop))
+	  return OPTIMIZE (evex_tzcnt);
+#endif
 #ifdef USE_EVEX512
         if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_EVEX512))
       return OPTIMIZE (evex512);
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 40d5a8819a..037ea9de64 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -99,6 +99,13 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI2)
 				      && CPU_FEATURE_USABLE (MOVBE)),
 				     __memcmp_evex_movbe)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, memcmp,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)
+				      && CPU_FEATURE_USABLE (MOVBE)),
+				     __memcmp_evex_movbe_tzcnt)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, memcmp,
 				     (CPU_FEATURE_USABLE (AVX2)
 				      && CPU_FEATURE_USABLE (BMI2)
@@ -374,6 +381,12 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __strlen_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, strlen,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __strlen_evex_tzcnt)
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, strlen,
 				     (CPU_FEATURE_USABLE (AVX512VL)
 				      && CPU_FEATURE_USABLE (AVX512BW)
@@ -544,6 +557,12 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __strchr_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, strchr,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __strchr_evex_tzcnt)
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, strchr,
 				     (CPU_FEATURE_USABLE (AVX512VL)
 				      && CPU_FEATURE_USABLE (AVX512BW)),
@@ -629,6 +648,12 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __strcmp_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, strcmp,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __strcmp_evex_tzcnt)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, strcmp,
 				     (CPU_FEATURE_USABLE (AVX2)
 				      && CPU_FEATURE_USABLE (BMI2)),
@@ -656,6 +681,11 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX512VL)
 				      && CPU_FEATURE_USABLE (AVX512BW)),
 				     __strcpy_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, strcpy,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)),
+				     __strcpy_evex_tzcnt)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, strcpy,
 				     CPU_FEATURE_USABLE (AVX2),
 				     __strcpy_avx2)
@@ -1318,6 +1348,12 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __strncmp_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, strncmp,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI1)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __strncmp_evex_tzcnt)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, strncmp,
 				     (CPU_FEATURE_USABLE (AVX2)
 				      && CPU_FEATURE_USABLE (BMI2)),
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index 7983763349..f9a5064787 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -20,6 +20,9 @@ 
 # include <init-arch.h>
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
+#ifdef USE_EVEX_TZCNT
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe_tzcnt) attribute_hidden;
+#endif
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
@@ -39,7 +42,15 @@  IFUNC_SELECTOR (void)
     {
       if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
-	return OPTIMIZE (evex_movbe);
+	{
+#ifdef USE_EVEX_TZCNT
+	  if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_TZCNT_for_stringop)
+	      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1))
+	    return OPTIMIZE (evex_movbe_tzcnt);
+#endif
+
+	  return OPTIMIZE (evex_movbe);
+	}
 
       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 	return OPTIMIZE (avx2_movbe_rtm);
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
index 1498b4858a..bc60676ab9 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
@@ -21,6 +21,9 @@ 
 #include <init-arch.h>
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+#ifdef USE_EVEX_TZCNT
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_tzcnt) attribute_hidden;
+#endif
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
@@ -41,7 +44,15 @@  IFUNC_SELECTOR (void)
     {
       if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
-	return OPTIMIZE (evex);
+	{
+#ifdef USE_EVEX_TZCNT
+	  if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_TZCNT_for_stringop)
+	      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1))
+	    return OPTIMIZE (evex_tzcnt);
+#endif
+
+	  return OPTIMIZE (evex);
+	}
 
       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 	return OPTIMIZE (avx2_rtm);
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe-tzcnt.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe-tzcnt.S
new file mode 100644
index 0000000000..01d054c636
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe-tzcnt.S
@@ -0,0 +1,8 @@ 
+#ifndef MEMCMP
+# define MEMCMP __memcmp_evex_movbe_tzcnt
+#endif
+
+#define FIND_FIRST_SET(src, dst)	tzcnt src, dst
+#define FIND_FIRST_SETQ(src, dst)	tzcntq src, dst
+
+#include "memcmp-evex-movbe.S"
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 10c9e64262..44d24c9895 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -138,7 +138,7 @@  ENTRY_P2ALIGN (MEMCMP, 6)
 
 	.p2align 4,, 11
 L(return_vec_0):
-	bsf	%VRAX, %VRAX
+	FIND_FIRST_SET (%VRAX, %VRAX)
 # ifdef USE_AS_WMEMCMP
 	movl	(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
@@ -267,10 +267,7 @@  L(return_vec_0_1_2_3):
 	jnz	L(return_vec_2)
 	.p2align 4,, 2
 L(return_vec_3):
-	/* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one
-	   fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
-	   line.  */
-	bsf	%VRCX, %VRCX
+	FIND_FIRST_SET (%VRCX, %VRCX)
 # ifdef USE_AS_WMEMCMP
 	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
 	xorl	%edx, %edx
@@ -287,9 +284,7 @@  L(return_vec_3):
 
 	.p2align 4,, 8
 L(return_vec_1):
-	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
-	   fetch block.  */
-	bsf	%VRAX, %VRAX
+	FIND_FIRST_SET (%VRAX, %VRAX)
 # ifdef USE_AS_WMEMCMP
 	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
@@ -305,9 +300,7 @@  L(return_vec_1):
 
 	.p2align 4,, 7
 L(return_vec_2):
-	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
-	   fetch block.  */
-	bsf	%VRAX, %VRAX
+	FIND_FIRST_SET (%VRAX, %VRAX)
 # ifdef USE_AS_WMEMCMP
 	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
@@ -422,7 +415,7 @@  L(8x_last_1x_vec):
 L(8x_return_vec_2):
 	subq	$VEC_SIZE, %rdx
 L(8x_return_vec_3):
-	bsf	%VRAX, %VRAX
+	FIND_FIRST_SET (%VRAX, %VRAX)
 # ifdef USE_AS_WMEMCMP
 	leaq	(%rdx, %rax, CHAR_SIZE), %rax
 	movl	(VEC_SIZE * 3)(%rax), %ecx
@@ -463,9 +456,7 @@  L(last_1x_vec):
 	 */
 	.p2align 4,, 3
 L(return_vec_1_end):
-	/* Use bsf to save code size. This is necessary to have
-	   L(one_or_less) fit in aligning bytes between.  */
-	bsf	%VRAX, %VRAX
+	FIND_FIRST_SET (%VRAX, %VRAX)
 	addl	%edx, %eax
 # ifdef USE_AS_WMEMCMP
 	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
@@ -484,7 +475,7 @@  L(return_vec_1_end):
 	/* Don't align. Takes 2-fetch blocks either way and aligning
 	   will cause code to spill into another cacheline.  */
 L(return_vec_0_end):
-	bsf	%VRAX, %VRAX
+	FIND_FIRST_SET (%VRAX, %VRAX)
 	addl	%edx, %eax
 # ifdef USE_AS_WMEMCMP
 	movl	-VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
diff --git a/sysdeps/x86_64/multiarch/memcmp.c b/sysdeps/x86_64/multiarch/memcmp.c
index 8dbb0f109a..73e0285241 100644
--- a/sysdeps/x86_64/multiarch/memcmp.c
+++ b/sysdeps/x86_64/multiarch/memcmp.c
@@ -24,6 +24,7 @@ 
 # undef memcmp
 
 # define SYMBOL_NAME memcmp
+# define USE_EVEX_TZCNT 1
 # include "ifunc-memcmp.h"
 
 libc_ifunc_redirected (__redirect_memcmp, memcmp, IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/strchr-evex-tzcnt.S b/sysdeps/x86_64/multiarch/strchr-evex-tzcnt.S
new file mode 100644
index 0000000000..9f181c9306
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-evex-tzcnt.S
@@ -0,0 +1,8 @@ 
+#ifndef STRCHR
+# define STRCHR	__strchr_evex_tzcnt
+#endif
+
+#define FIND_FIRST_SET(src, dst)	tzcnt src, dst
+#define FIND_FIRST_SETQ(src, dst)	tzcntq src, dst
+
+#include "strchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
index d82c96c36f..303bf6e25c 100644
--- a/sysdeps/x86_64/multiarch/strchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
@@ -116,12 +116,12 @@  ENTRY_P2ALIGN (STRCHR, 6)
 	/* If VEC_SIZE == 64 && STRCHRNUL use bsf to test condition so
 	   that all logic for match/null in first VEC first in 1x cache
 	   lines.  This has a slight cost to larger sizes.  */
-	bsf	%VRAX, %VRAX
+	FIND_FIRST_SET (%VRAX, %VRAX)
 	jz	L(aligned_more)
 # else
 	test	%VRAX, %VRAX
 	jz	L(aligned_more)
-	bsf	%VRAX, %VRAX
+	FIND_FIRST_SET (%VRAX, %VRAX)
 # endif
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.  */
@@ -161,7 +161,7 @@  L(last_vec_x2):
 L(first_vec_x1):
 	/* Use bsf here to save 1-byte keeping keeping the block in 1x
 	   fetch block. eax guaranteed non-zero.  */
-	bsf	%VRCX, %VRCX
+	FIND_FIRST_SET (%VRCX, %VRCX)
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.  */
 	cmp	(VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %CHAR_REG
@@ -188,7 +188,7 @@  L(first_vec_x2):
 	/* Combine CHAR and null matches.  */
 	KOR	%k0, %k1, %k0
 	KMOV	%k0, %VRAX
-	bsf	%VRAX, %VRAX
+	FIND_FIRST_SET (%VRAX, %VRAX)
 # endif
 	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 	   bytes.  */
@@ -384,7 +384,7 @@  L(loop_4x_vec):
 	orq	%rcx, %rdx
 # endif
 
-	bsfq	%rdx, %rdx
+	FIND_FIRST_SETQ (%rdx, %rdx)
 
 # ifndef USE_AS_STRCHRNUL
 	/* Check if match was CHAR or null.  */
@@ -409,7 +409,7 @@  L(zero_end):
 # if VEC_SIZE == 64
 	.p2align 4,, 8
 L(last_vec_x1_vec_size64):
-	bsf	%VRCX, %VRCX
+	FIND_FIRST_SET (%VRCX, %VRCX)
 #  ifndef USE_AS_STRCHRNUL
 	/* Check if match was null.  */
 	cmp	(%rax, %rcx, CHAR_SIZE), %CHAR_REG
@@ -435,7 +435,7 @@  L(last_vec_x3):
 	/* Duplicate L(last_vec_x2) for VEC_SIZE == 64 because we can't
 	   reuse L(first_vec_x3) due to register mismatch.  */
 L(last_vec_x2):
-	bsf	%VGPR(MASK_GPR), %VGPR(MASK_GPR)
+	FIND_FIRST_SET (%VGPR(MASK_GPR), %VGPR(MASK_GPR))
 #  ifndef USE_AS_STRCHRNUL
 	/* Check if match was null.  */
 	cmp	(VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %CHAR_REG
@@ -493,7 +493,7 @@  L(cross_page_boundary_real):
 
 	.p2align 4,, 10
 L(last_vec_x1_vec_size32):
-	bsf	%VRAX, %VRAX
+	FIND_FIRST_SET (%VRAX, %VRAX)
 # ifdef USE_AS_WCSCHR
 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
 	 */
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
index f6bd36ba73..853a3833ff 100644
--- a/sysdeps/x86_64/multiarch/strchr.c
+++ b/sysdeps/x86_64/multiarch/strchr.c
@@ -27,6 +27,7 @@ 
 # include <init-arch.h>
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_tzcnt) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex512) attribute_hidden;
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
@@ -48,6 +49,10 @@  IFUNC_SELECTOR (void)
       if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 	{
+	  if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_TZCNT_for_stringop)
+	      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1))
+	    return OPTIMIZE (evex_tzcnt);
+
 	  if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_EVEX512))
 	    return OPTIMIZE (evex512);
 
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex-tzcnt.S b/sysdeps/x86_64/multiarch/strcmp-evex-tzcnt.S
new file mode 100644
index 0000000000..e2377e026e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-evex-tzcnt.S
@@ -0,0 +1,5 @@ 
+#define STRCMP_ISA	_evex_tzcnt
+#define FIND_FIRST_SET(src, dst)	tzcnt src, dst
+#define FIND_FIRST_SETQ(src, dst)	tzcntq src, dst
+
+#include "strcmp-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index d87d33cc64..f259c36bb7 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -24,7 +24,9 @@ 
 #  include "x86-evex256-vecs.h"
 # endif
 
-# define STRCMP_ISA	_evex
+# ifndef STRCMP_ISA
+#  define STRCMP_ISA	_evex
+# endif
 # include "strcmp-naming.h"
 
 # include <sysdep.h>
@@ -335,7 +337,7 @@  L(no_page_cross):
 
 	.p2align 4,, 4
 L(return_vec_0):
-	bsf	%VRCX, %VRCX
+	FIND_FIRST_SET (%VRCX, %VRCX)
 # ifdef USE_AS_WCSCMP
 	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -406,7 +408,7 @@  L(ret1):
 
 	.p2align 4,, 10
 L(return_vec_1):
-	bsf	%VRCX, %VRCX
+	FIND_FIRST_SET (%VRCX, %VRCX)
 # ifdef USE_AS_STRNCMP
 	/* rdx must be > CHAR_PER_VEC so its safe to subtract without
 	   worrying about underflow.  */
@@ -447,7 +449,7 @@  L(return_vec_3):
 #  else
 	/* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
 	   check it.  */
-	bsf	%VRCX, %VRCX
+	FIND_FIRST_SET (%VRCX, %VRCX)
 	addl	$(CHAR_PER_VEC), %ecx
 	cmpq	%rcx, %rdx
 	ja	L(ret_vec_3_finish)
@@ -460,9 +462,9 @@  L(return_vec_3):
 	   2x VEC so need separate return label.  */
 L(return_vec_2):
 # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
-	bsf	%VRCX, %VRCX
+	FIND_FIRST_SET (%VRCX, %VRCX)
 # else
-	bsfq	%rcx, %rcx
+	FIND_FIRST_SETQ (%rcx, %rcx)
 # endif
 # ifdef USE_AS_STRNCMP
 	cmpq	%rcx, %rdx
@@ -491,7 +493,7 @@  L(ret3):
 # ifndef USE_AS_STRNCMP
 	.p2align 4,, 10
 L(return_vec_3):
-	bsf	%VRCX, %VRCX
+	FIND_FIRST_SET (%VRCX, %VRCX)
 #  ifdef USE_AS_WCSCMP
 	movl	(VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -703,9 +705,9 @@  L(return_vec_2_3_end):
 	   mismatch must entirely be from VEC 3 which is fully
 	   represented by LOOP_REG.  */
 # if CHAR_PER_VEC <= 16
-	bsf	%LOOP_REG, %LOOP_REG
+	FIND_FIRST_SET (%LOOP_REG, %LOOP_REG)
 # else
-	bsfq	%LOOP_REG64, %LOOP_REG64
+	FIND_FIRST_SETQ (%LOOP_REG64, %LOOP_REG64)
 # endif
 # ifdef USE_AS_STRNCMP
 
@@ -768,7 +770,7 @@  L(return_vec_1_end):
 #  else
 	/* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
 	   check it.  */
-	bsf	%VRCX, %VRCX
+	FIND_FIRST_SET (%VRCX, %VRCX)
 	addl	$(CHAR_PER_VEC), %ecx
 	cmpq	%rcx, %rdx
 	ja	L(ret_vec_0_end_finish)
@@ -778,9 +780,9 @@  L(return_vec_1_end):
 # endif
 L(return_vec_0_end):
 # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
-	bsf	%VRCX, %VRCX
+	FIND_FIRST_SET (%VRCX, %VRCX)
 # else
-	bsfq	%rcx, %rcx
+	FIND_FIRST_SETQ (%rcx, %rcx)
 # endif
 
 # ifdef USE_AS_STRNCMP
@@ -816,7 +818,7 @@  L(ret6):
 # ifndef USE_AS_STRNCMP
 	.p2align 4,, 10
 L(return_vec_1_end):
-	bsf	%VRCX, %VRCX
+	FIND_FIRST_SET (%VRCX, %VRCX)
 #  ifdef USE_AS_WCSCMP
 	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -843,7 +845,7 @@  L(ret7):
 	   2x VEC so need separate return label.  */
 # if CHAR_PER_VEC == 64
 L(return_vec_2_end):
-	bsf	%VRCX, %VRCX
+	FIND_FIRST_SET (%VRCX, %VRCX)
 #  ifdef USE_AS_STRNCMP
 	cmpq	%rcx, %rdx
 	jbe	L(ret_zero_end)
@@ -963,7 +965,7 @@  L(less_1x_vec_till_page_cross):
 	and	%VR10, %VRCX
 	jz	L(loop_skip_page_cross_check)
 
-	bsf	%VRCX, %VRCX
+	FIND_FIRST_SET (%VRCX, %VRCX)
 
 # if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
 	leal	-VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
@@ -1104,7 +1106,7 @@  L(ret_zero_in_loop_page_cross):
 L(return_vec_page_cross_0):
 	addl	$-VEC_SIZE, %eax
 L(return_vec_page_cross_1):
-	bsf	%VRCX, %VRCX
+	FIND_FIRST_SET (%VRCX, %VRCX)
 # if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
 	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
 #  ifdef USE_AS_STRNCMP
diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
index ba6abf7dde..94efee00ee 100644
--- a/sysdeps/x86_64/multiarch/strcmp.c
+++ b/sysdeps/x86_64/multiarch/strcmp.c
@@ -27,6 +27,7 @@ 
 # include <init-arch.h>
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_tzcnt) attribute_hidden;
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
@@ -51,7 +52,13 @@  IFUNC_SELECTOR (void)
     {
       if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
-	return OPTIMIZE (evex);
+	{
+	  if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_TZCNT_for_stringop)
+	      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1))
+	    return OPTIMIZE (evex_tzcnt);
+
+	  return OPTIMIZE (evex);
+	}
 
       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 	return OPTIMIZE (avx2_rtm);
diff --git a/sysdeps/x86_64/multiarch/strcpy-evex-tzcnt.S b/sysdeps/x86_64/multiarch/strcpy-evex-tzcnt.S
new file mode 100644
index 0000000000..09d9a90d83
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-evex-tzcnt.S
@@ -0,0 +1,7 @@ 
+#ifndef STRCPY
+# define STRCPY	__strcpy_evex_tzcnt
+#endif
+
+#define FIND_FIRST_SET(src, dst)	tzcnt src, dst
+
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
index a0d4b9e8cd..891e355a88 100644
--- a/sysdeps/x86_64/multiarch/strcpy-evex.S
+++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
@@ -121,7 +121,7 @@  L(page_cross_continue):
 	VMOVU_MASK %VMM(0), (%rdi){%k1}
 
 #  ifdef USE_AS_STPCPY
-	bsf	%VRCX, %VRCX
+	FIND_FIRST_SET (%VRCX, %VRCX)
 	leaq	(%rdi, %rcx, CHAR_SIZE), %rax
 #  endif
 	ret
@@ -133,7 +133,7 @@  L(page_cross_continue):
 	jz	L(more_1x_vec)
 
 	xorl	%edx, %edx
-	bsf	%VRCX, %VRDX
+	FIND_FIRST_SET (%VRCX, %VRDX)
 #  ifdef USE_AS_STPCPY
 	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
 #  endif
@@ -351,7 +351,7 @@  L(loop_4x_done):
 	   meaningfuly benefit doing this for stpcpy.  */
 	KMOV	%k4, %VRDX
 L(ret_vec_x3):
-	bsf	%VRDX, %VRDX
+	FIND_FIRST_SET (%VRDX, %VRDX)
 	VMOVU	((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
 	VMOVU	%VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
@@ -362,7 +362,7 @@  L(return_end):
 
 	.p2align 4,, 6
 L(ret_vec_x0_end):
-	bsf	%VRCX, %VRCX
+	FIND_FIRST_SET (%VRCX, %VRCX)
 # ifdef USE_AS_STPCPY
 	leaq	(%rdi, %rcx, CHAR_SIZE), %rax
 # endif
@@ -373,7 +373,7 @@  L(ret_vec_x0_end):
 
 	.p2align 4,, 8
 L(ret_vec_x1):
-	bsf	%VRCX, %VRCX
+	FIND_FIRST_SET (%VRCX, %VRCX)
 	VMOVU	(VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
 	VMOVU	%VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
@@ -383,7 +383,7 @@  L(ret_vec_x1):
 
 	.p2align 4,, 4
 L(ret_vec_x2):
-	bsf	%VRCX, %VRCX
+	FIND_FIRST_SET (%VRCX, %VRCX)
 	VMOVU	((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
 	VMOVU	%VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
@@ -394,7 +394,7 @@  L(ret_vec_x2):
 	/* ret_vec_x3 reuses return code after the loop.  */
 	.p2align 4,, 6
 L(ret_vec_x4):
-	bsf	%VRCX, %VRCX
+	FIND_FIRST_SET (%VRCX, %VRCX)
 	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
 	VMOVU	%VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
 # ifdef USE_AS_STPCPY
@@ -432,7 +432,7 @@  L(page_cross):
 #  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
 	movq	%rdi, %rax
 #  endif
-	bsf	%VRCX, %VRCX
+	FIND_FIRST_SET (%VRCX, %VRCX)
 	REP_MOVS
 
 #  ifdef USE_AS_STPCPY
@@ -454,7 +454,7 @@  L(page_cross):
 	xorl	%edx, %edx
 #  endif
 	/* Dependency on rdi must already have been satisfied.  */
-	bsf	%VRCX, %VRDX
+	FIND_FIRST_SET (%VRCX, %VRDX)
 #  ifdef USE_AS_STPCPY
 	leaq	(%rdi, %rdx, CHAR_SIZE), %rax
 #  elif !defined USE_AS_STRCAT
diff --git a/sysdeps/x86_64/multiarch/strcpy.c b/sysdeps/x86_64/multiarch/strcpy.c
index cf5bd7c08d..520cf737ea 100644
--- a/sysdeps/x86_64/multiarch/strcpy.c
+++ b/sysdeps/x86_64/multiarch/strcpy.c
@@ -24,6 +24,7 @@ 
 # undef strcpy
 
 # define SYMBOL_NAME strcpy
+# define USE_EVEX_TZCNT 1
 # include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_strcpy, strcpy, IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index 25db695770..74bd752db5 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -82,12 +82,12 @@  ENTRY_P2ALIGN(STRLEN, 6)
 	KMOV	%k0, %VRAX
 	test	%VRAX, %VRAX
 	jz	L(aligned_more)
-	bsf	%VRAX, %VRAX
+	FIND_FIRST_SET (%VRAX, %VRAX)
 	ret
 
 	.p2align 4,, 8
 L(first_vec_x4):
-	bsf	%VRAX, %VRAX
+	FIND_FIRST_SET (%VRAX, %VRAX)
 	subl	%ecx, %edi
 	CHAR_SIZE_SHIFT_REG (edi)
 	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
@@ -178,7 +178,7 @@  L(loop_4x_vec):
 	/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.  */
 	.p2align 4,, 2
 L(FALLTHROUGH_RETURN_LBL):
-	bsfq	%rax, %rax
+	FIND_FIRST_SETQ (%rax, %rax)
 	subq	%rcx, %rdi
 	CHAR_SIZE_SHIFT_REG (rdi)
 	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
@@ -186,7 +186,7 @@  L(FALLTHROUGH_RETURN_LBL):
 
 	.p2align 4,, 8
 L(first_vec_x0):
-	bsf	%VRAX, %VRAX
+	FIND_FIRST_SET (%VRAX, %VRAX)
 	sub	%rcx, %rdi
 	CHAR_SIZE_SHIFT_REG (rdi)
 	addq	%rdi, %rax
@@ -194,7 +194,7 @@  L(first_vec_x0):
 
 	.p2align 4,, 10
 L(first_vec_x1):
-	bsf	%VRAX, %VRAX
+	FIND_FIRST_SET (%VRAX, %VRAX)
 	sub	%rcx, %rdi
 	CHAR_SIZE_SHIFT_REG (rdi)
 	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
@@ -203,7 +203,7 @@  L(first_vec_x1):
 	.p2align 4,, 10
 	/* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.  */
 L(TAIL_RETURN_LBL):
-	bsf	%VRAX, %VRAX
+	FIND_FIRST_SET (%VRAX, %VRAX)
 	sub	%VRCX, %VRDI
 	CHAR_SIZE_SHIFT_REG (VRDI)
 	lea	(TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
@@ -228,7 +228,7 @@  L(cross_page_boundary):
 	shr	%cl, %VRAX
 # endif
 	jz	L(cross_page_continue)
-	bsf	%VRAX, %VRAX
+	FIND_FIRST_SET (%VRAX, %VRAX)
 	ret
 
 END(STRLEN)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex-tzcnt.S b/sysdeps/x86_64/multiarch/strlen-evex-tzcnt.S
new file mode 100644
index 0000000000..f99272232e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex-tzcnt.S
@@ -0,0 +1,10 @@ 
+#ifndef STRLEN
+# define STRLEN		__strlen_evex_tzcnt
+#endif
+
+#define FIND_FIRST_SET(src, dst)	tzcnt src, dst
+#define FIND_FIRST_SETQ(src, dst)	tzcntq src, dst
+
+#include "x86-evex256-vecs.h"
+#include "reg-macros.h"
+#include "strlen-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strlen.c b/sysdeps/x86_64/multiarch/strlen.c
index 2f93abf1c8..05ed6e9c1b 100644
--- a/sysdeps/x86_64/multiarch/strlen.c
+++ b/sysdeps/x86_64/multiarch/strlen.c
@@ -24,6 +24,7 @@ 
 # undef strlen
 
 # define SYMBOL_NAME strlen
+# define USE_EVEX_TZCNT 1
 # define USE_EVEX512 1
 # include "ifunc-avx2.h"
 
diff --git a/sysdeps/x86_64/multiarch/strncmp-evex-tzcnt.S b/sysdeps/x86_64/multiarch/strncmp-evex-tzcnt.S
new file mode 100644
index 0000000000..b9ea0a2d3d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncmp-evex-tzcnt.S
@@ -0,0 +1,6 @@ 
+#define STRCMP_ISA	_evex_tzcnt
+#define FIND_FIRST_SET(src, dst)	tzcnt src, dst
+#define FIND_FIRST_SETQ(src, dst)	tzcntq src, dst
+#define USE_AS_STRNCMP 1
+
+#include "strcmp-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
index 02863eeb71..f46f37e05b 100644
--- a/sysdeps/x86_64/multiarch/strncmp.c
+++ b/sysdeps/x86_64/multiarch/strncmp.c
@@ -27,6 +27,7 @@ 
 # include <init-arch.h>
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_tzcnt) attribute_hidden;
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
@@ -47,7 +48,13 @@  IFUNC_SELECTOR (void)
     {
       if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
-	return OPTIMIZE (evex);
+	{
+	  if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_TZCNT_for_stringop)
+	      && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI1))
+	    return OPTIMIZE (evex_tzcnt);
+
+	  return OPTIMIZE (evex);
+	}
 
       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 	return OPTIMIZE (avx2_rtm);
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index f71f49213f..b7652b939e 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -175,6 +175,14 @@  lose:									      \
 # define VZEROUPPER_RETURN	VZEROUPPER; ret
 #endif
 
+#ifndef FIND_FIRST_SET
+# define FIND_FIRST_SET(src, dst)	bsf src, dst
+#endif
+
+#ifndef FIND_FIRST_SETQ
+# define FIND_FIRST_SETQ(src, dst)	bsfq src, dst
+#endif
+
 #else	/* __ASSEMBLER__ */
 
 /* Instruction to operate on long and pointer.  */