[neleai/string-x64] Optimize strncmp with unaligned loads.

Message ID 20150620174804.GA20938@domone
State New, archived

Commit Message

Ondrej Bilka June 20, 2015, 5:48 p.m. UTC
  Hi,

This uses the refactored strcmp to add strncmp functionality.

The basic idea is simple: in the unaligned header, when n < 64, initialize
a register with 1<<(n-1) at the start and OR it into the mismatch masks.
From what I tried, that gives the smallest slowdown versus strcmp.
For larger sizes I adjust the counter that sends us to the cross-page
unaligned-load path so that it also triggers when fewer than 64
characters remain to be inspected.
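
To make the idea concrete, here is a rough C sketch of both tricks using
SSE2 intrinsics. It only covers the first 16 bytes (the real code handles
64 bytes in assembly), and the function names and the loop-count helper
are made up for illustration, not part of the patch:

#include <emmintrin.h>
#include <stddef.h>

/* Illustrative sketch of the length trick in the unaligned header.
   Assumes 1 <= n and that 16 bytes are readable at both pointers.  */
static int
strncmp_head16 (const char *s1, const char *s2, size_t n)
{
  __m128i a = _mm_loadu_si128 ((const __m128i *) s1);
  __m128i b = _mm_loadu_si128 ((const __m128i *) s2);

  /* eq has 0xff where the bytes agree (pcmpeqb).  */
  __m128i eq = _mm_cmpeq_epi8 (a, b);
  /* min (a, eq) is zero where the bytes differ or s1 has a NUL byte
     (pminub followed by pcmpeqb against zero).  */
  __m128i hit = _mm_cmpeq_epi8 (_mm_min_epu8 (a, eq), _mm_setzero_si128 ());
  unsigned int mask = _mm_movemask_epi8 (hit);

  /* The strncmp addition: mark position n-1 as a hit so the scan
     below never reports anything past the first n bytes.  */
  if (n < 16)
    mask |= 1u << (n - 1);

  if (mask == 0)
    return 0;   /* First 16 bytes equal and n >= 16; real code continues.  */

  unsigned int i = __builtin_ctz (mask);
  return (unsigned char) s1[i] - (unsigned char) s2[i];
}

/* Second trick: clamp the main-loop trip count so the existing
   cross-page slow path also fires when fewer than 64 bytes of the
   length budget remain.  */
static size_t
loop_blocks (size_t blocks_until_page_cross, size_t bytes_remaining)
{
  size_t remaining_blocks = bytes_remaining / 64;
  return blocks_until_page_cross < remaining_blocks
	 ? blocks_until_page_cross : remaining_blocks;
}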

Performance with these tricks is nearly identical to that of strcmp.

OK to add this?

Profile graphs are here:

http://kam.mff.cuni.cz/~ondra/benchmark_string/strncmp_profile.html

	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
	strncmp-avx2.S and strncmp-sse2-unaligned.S.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Add
	__strncmp_sse2_unaligned and __strncmp_avx2.
	* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Add strncmp
	functionality.
	* sysdeps/x86_64/multiarch/strcmp.S: Adjust ifunc.
	* sysdeps/x86_64/multiarch/strncmp-avx2.S: New file.
	* sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S: Likewise.

---
 sysdeps/x86_64/multiarch/Makefile                 |  4 +-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c        |  5 +-
 sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S  | 78 +++++++++++++++++++++--
 sysdeps/x86_64/multiarch/strcmp.S                 | 37 ++++++-----
 sysdeps/x86_64/multiarch/strncmp-avx2.S           |  4 ++
 sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S |  3 +
 6 files changed, 104 insertions(+), 27 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strncmp-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S
  

Patch

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index bf48283..95e0190 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -7,7 +7,7 @@  endif
 ifeq ($(subdir),string)
 
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
-		   strcmp-sse2-unaligned strncmp-ssse3 \
+		   strcmp-sse2-unaligned strncmp-sse2-unaligned strncmp-ssse3 \
 		   memcpy-ssse3 \
 		   memcpy-sse2-unaligned mempcpy-ssse3 \
 		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
@@ -30,7 +30,7 @@  CFLAGS-strspn-c.c += -msse4
 endif
 
 ifeq (yes,$(config-cflags-avx2))
-sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2  strcmp-avx2
+sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2  strcmp-avx2 strncmp-avx2
 endif
 endif
 
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 57ce237..51ff3ed 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -257,8 +257,9 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncmp.S.  */
   IFUNC_IMPL (i, name, strncmp,
-	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSE4_2,
-			      __strncmp_sse42)
+ 	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_AVX2, __strncmp_avx2)
+
 	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSSE3,
 			      __strncmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index 8258eb8..f3a0508 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -19,6 +19,14 @@ 
 #include "sysdep.h"
 
 ENTRY ( __strcmp_sse2_unaligned)
+#ifdef AS_STRNCMP
+	lea	-1(%rdx), %r10
+	test	%rdx, %rdx
+	je	L(ret_zero)
+L(back_to_start):
+	xor	%rdx, %rdx
+#endif
+
 	pxor	%xmm7, %xmm7
 	movl	%esi, %eax
 	andl	$4095, %eax
@@ -29,20 +37,35 @@  ENTRY ( __strcmp_sse2_unaligned)
 	andl	$4095, %eax
 	cmpl	$4032, %eax
 	jg	L(cross_page)
+#ifdef AS_STRNCMP
+	cmp	$64, %r10
+	jae	L(dont_set_mask)
+	bts	%r10, %rdx
+L(dont_set_mask):
+#endif
+
 	movdqu	(%rdi), %xmm1
 	movdqu	(%rsi), %xmm0
 	pcmpeqb	%xmm1, %xmm0
 	pminub	%xmm1, %xmm0
 	pcmpeqb	%xmm7, %xmm0
 	pmovmskb %xmm0, %eax
+#ifdef AS_STRNCMP
+	or	%dx, %ax
+#else
 	test	%eax, %eax
+#endif
 	je	L(next_48_bytes)
 	bsf	%eax, %edx
 	movzbl	(%rdi, %rdx), %eax
 	movzbl	(%rsi, %rdx), %edx
 	subl	%edx, %eax
 	ret
-
+#ifdef AS_STRNCMP
+	L(ret_zero):
+	xor	%eax, %eax
+	ret
+#endif
 	.p2align 4
 L(next_48_bytes):
 	movdqu	16(%rdi), %xmm6
@@ -54,16 +77,19 @@  L(next_48_bytes):
 	pcmpeqb	%xmm7, %xmm3
 	movdqu	48(%rdi), %xmm4
 	pcmpeqb	%xmm5, %xmm2
-	pmovmskb %xmm3, %edx
 	movdqu	48(%rsi), %xmm0
 	pminub	%xmm5, %xmm2
 	pcmpeqb	%xmm7, %xmm2
 	pcmpeqb	%xmm4, %xmm0
 	pmovmskb %xmm2, %eax
+	salq	$32, %rax
+#ifdef AS_STRNCMP
+	or	%rdx, %rax
+#endif
+	pmovmskb %xmm3, %edx
 	sal	$16, %edx
 	pminub	%xmm4, %xmm0
 	pcmpeqb	%xmm7, %xmm0
-	salq	$32, %rax
 	orq	%rdx, %rax
 	pmovmskb %xmm0, %ecx
 	salq	$48, %rcx
@@ -82,6 +108,10 @@  L(main_loop_header):
 #endif
 	leaq	64(%rdi), %rdx
 	andq	$-64, %rdx
+# ifdef AS_STRNCMP
+	addq	%rdi, %r10
+	subq	%rdx, %r10
+# endif
 	subq	%rdi, %rdx
 	leaq	(%rdi, %rdx), %rax
 	addq	%rsi, %rdx
@@ -90,6 +120,15 @@  L(main_loop_header):
 	andl	$4095, %ecx
 	sub	%ecx, %esi
 	shr	$6, %esi
+#ifdef AS_STRNCMP
+	mov	%r10, %r9
+	addq	%rdx, %r10
+	shr	$6, %r9
+	cmp	%r9, %rsi
+	jb	L(dont_set_page_bound)
+	mov	%r9, %rsi
+L(dont_set_page_bound):
+#endif
 
 	.p2align 4
 L(loop):
@@ -111,7 +150,7 @@  L(back_to_loop):
 	addq	$64, %rdx
 	vpmovmskb %ymm2, %edi
 	test	%edi, %edi
-	je	.Lloop
+	je	L(loop)
 	shl	$32, %rdi
 	vpcmpeqb %ymm7, %ymm0, %ymm0
 	vpmovmskb %ymm0, %ecx
@@ -164,6 +203,14 @@  L(back_to_loop):
 
 	.p2align 4
 L(loop_cross_page):
+#ifdef AS_STRNCMP
+	mov	%r10, %r9
+	sub	%rdx, %r9
+	cmp	$64, %r9
+	jb	L(prepare_back_to_start)
+#endif
+
+
 	mov	%edx, %ecx
 	and	$63, %ecx
 	neg	%rcx
@@ -219,6 +266,14 @@  L(loop_cross_page):
 #endif
 	mov	%edx, %ecx
 	mov	$63, %esi
+#ifdef AS_STRNCMP
+	shr	$6, %r9
+	sub	$1, %r9
+	cmp	%r9, %rsi
+	jb	L(dont_set_bound2)
+	mov	%r9, %rsi
+L(dont_set_bound2):
+#endif
 	shrq	%cl, %rdi
 	test	%rdi, %rdi
 	je	L(back_to_loop)
@@ -231,6 +286,18 @@  L(loop_cross_page):
 	subl	%edx, %eax
 	ret
 
+#ifdef AS_STRNCMP
+L(prepare_back_to_start):
+# ifdef USE_AVX2
+	vzeroupper
+# endif
+	mov	%r9, %r10
+	mov	%rdx, %rsi
+	mov	%rax, %rdi
+	jmp	L(back_to_start)
+#endif
+
+
 L(cross_page):
 	xorl	%edx, %edx
 	jmp	L(cross_page_loop_start)
@@ -244,6 +311,9 @@  L(cross_page_loop_start):
 	movzbl	(%rsi, %rdx), %ecx
 	subl	%ecx, %eax
 	jne	L(different)
+	cmp	%rdx, %r10
+	je	L(different)
+
 	test	%ecx, %ecx
 	jne	L(cross_page_loop)
 L(different):
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index 867e9d4..02d22d1 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -31,8 +31,8 @@ 
 	test	%r9, %r9;				\
 	je	LABEL(strcmp_exitz);			\
 	mov	%r9, %r11
-
-# define STRCMP_SSE42	__strncmp_sse42
+# define STRCMP_AVX2	__strncmp_avx2
+# define STRCMP_SSE2_UNALIGNED	__strncmp_sse2_unaligned
 # define STRCMP_SSSE3	__strncmp_ssse3
 # define STRCMP_SSE2	__strncmp_sse2
 # define __GI_STRCMP	__GI_strncmp
@@ -69,8 +69,9 @@ 
 # define USE_AS_STRCMP
 # define UPDATE_STRNCMP_COUNTER
 # ifndef STRCMP
+#  define STRCMP_AVX2	__strcmp_avx2
+#  define STRCMP_SSE2_UNALIGNED	__strcmp_sse2_unaligned
 #  define STRCMP	strcmp
-#  define STRCMP_SSE42	__strcmp_sse42
 #  define STRCMP_SSSE3	__strcmp_ssse3
 #  define STRCMP_SSE2	__strcmp_sse2
 #  define __GI_STRCMP	__GI_strcmp
@@ -89,23 +90,23 @@  ENTRY(STRCMP)
 	jne	1f
 	call	__init_cpu_features
 1:
-#ifdef USE_AS_STRCMP
-# ifdef HAVE_AVX2_SUPPORT
+# if defined (USE_AS_STRCMP) || defined (USE_AS_STRNCMP)
+#  ifdef HAVE_AVX2_SUPPORT
 
-        leaq    __strcmp_avx2(%rip), %rax
+        leaq    STRCMP_AVX2(%rip), %rax
         testl   $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jnz	3f
-# endif
-	leaq	__strcmp_sse2_unaligned(%rip), %rax
+#  endif
+	leaq	STRCMP_SSE2_UNALIGNED(%rip), %rax
 	testl   $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
 	jnz     3f
-#else
+# else
 	testl	$bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip)
 	jnz	2f
 	leaq	STRCMP_SSE42(%rip), %rax
 	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
 	jnz	3f
-#endif
+# endif
 2:	leaq	STRCMP_SSSE3(%rip), %rax
 	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
 	jnz	3f
@@ -166,15 +167,13 @@  END(__strncasecmp)
 weak_alias (__strncasecmp, strncasecmp)
 # endif
 
-# undef LABEL
-# define LABEL(l) .L##l##_sse42
-# define GLABEL(l) l##_sse42
-# define SECTION sse4.2
-# include "strcmp-sse42.S"
-
-
-# ifdef HAVE_AVX_SUPPORT
-#  if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+#  undef LABEL
+#  define LABEL(l) .L##l##_sse42
+#  define GLABEL(l) l##_sse42
+#  define SECTION sse4.2
+#  include "strcmp-sse42.S"
+#  ifdef HAVE_AVX_SUPPORT
 #   define LABEL(l) .L##l##_avx
 #   define GLABEL(l) l##_avx
 #   define USE_AVX 1
diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S
new file mode 100644
index 0000000..fe70abd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S
@@ -0,0 +1,4 @@ 
+#define USE_AVX2
+#define AS_STRNCMP
+#define __strcmp_sse2_unaligned __strncmp_avx2
+#include "strcmp-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S
new file mode 100644
index 0000000..d987b28
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S
@@ -0,0 +1,3 @@ 
+#define AS_STRNCMP
+#define __strcmp_sse2_unaligned __strncmp_sse2_unaligned
+#include "strcmp-sse2-unaligned.S"