[neleai/string-x64] Microoptimize strcmp-sse2-unaligned.

Message ID 20150620083525.GA31992@domone
State New, archived

Commit Message

Ondrej Bilka June 20, 2015, 8:35 a.m. UTC
  Hi,

When I reread strcmp in order to improve strncmp and add an avx2 strcmp,
I found that I had made several mistakes, mainly caused by first optimizing
the C template and then fixing up the assembly.

The first was my idea to simplify the cross-page check by ORing src and
dest. I recall that I originally had complex cross-page handling where
false positives were cheap. Then I found that, due to its size, it had a
big overhead and a simple loop was faster when testing with Firefox,
which turned the original decision into a bad one.
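
As a sketch (abridged from the diff below; 4032 is 4096 minus the 64
bytes read ahead), the old combined check and its replacement are:

	/* Old: a single branch, but ORing the page offsets gives false
	   positives, e.g. offsets 0xf00 and 0x0c1 OR to 0xfc1 > 4032
	   although neither pointer is within 64 bytes of a page end.  */
	movl	%edi, %eax
	orl	%esi, %eax
	andl	$4095, %eax
	cmpl	$4032, %eax
	jg	L(cross_page)

	/* New: one extra compare and branch, but the slow path is taken
	   only when a 64-byte read really could cross a page.  */
	movl	%esi, %eax
	andl	$4095, %eax
	cmpl	$4032, %eax
	jg	L(cross_page)
	movl	%edi, %eax
	andl	$4095, %eax
	cmpl	$4032, %eax
	jg	L(cross_page)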

The second is to reorganize the loop instructions so that after the loop
ends I can find the terminating byte without recalculating much, using
the trick that the last 16-bit mask can be ORed with the previous three,
as it is only relevant when the previous three are zero.
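
Concretely, the new exit path looks like this (an annotated copy of what
is in the diff below):

	/* %ecx still holds the mask of the pminub-combined vector that
	   ended the loop.  A spurious bit in it can only come from a
	   terminator in one of the first three blocks, which also sets
	   a lower bit, so bsf is unaffected; and when the first three
	   blocks are clean, %ecx is exactly the mask of the fourth
	   block.  */
	pcmpeqb	%xmm7, %xmm0
	pcmpeqb	%xmm7, %xmm1
	pcmpeqb	%xmm7, %xmm5
	pmovmskb %xmm0, %edi
	pmovmskb %xmm1, %esi
	pmovmskb %xmm5, %r8d
	salq	$48, %rcx	/* combined mask stands in for block 3 */
	salq	$32, %r8
	orq	%r8, %rcx
	orq	%rdi, %rcx
	sal	$16, %esi
	orq	%rsi, %rcx
	bsfq	%rcx, %rcx	/* index of the first NUL or mismatch */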

The third is that gcc generates bad loops with respect to where the
pointer increments are placed. They should go after the loads that use
them, not at the start of the loop as gcc puts them. That change is
responsible for a 10% improvement for large sizes.
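
In the rewritten loop (abridged from the diff below) the increments sit
right after the last loads that need the old pointer values rather than
at the loop head:

L(back_to_loop):
	movdqu	(%rdx), %xmm0
	/* ... remaining loads and pcmpeqb/pminub of the 64-byte block ... */
	movdqa	48(%rax), %xmm3
	pcmpeqb	%xmm2, %xmm5
	pcmpeqb	%xmm3, %xmm6
	addq	$64, %rax	/* bump only after the last load from %rax */
	pminub	%xmm2, %xmm5
	pminub	%xmm3, %xmm6
	addq	$64, %rdx	/* likewise after the last load from %rdx */
	pminub	%xmm5, %xmm6
	pminub	%xmm1, %xmm6
	pminub	%xmm0, %xmm6
	pcmpeqb	%xmm7, %xmm6
	pmovmskb %xmm6, %ecx
	testl	%ecx, %ecx
	je	L(loop)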

Finally, there are microoptimizations that save a few bytes without
measurable performance impact, such as using eax instead of rax to save
a byte, or dropping zeroing instructions where they are not needed.
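
For instance, dropping the REX.W prefix saves one byte per instruction
where the upper 32 bits cannot matter:

	testq	%rax, %rax	/* 48 85 c0 - three bytes */
	test	%eax, %eax	/* 85 c0    - two bytes; same flags here,
				   since pmovmskb only sets the low 16 bits */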

Profile data are here; it will shortly also include the avx2 variant for
Haswell that I will submit next.

http://kam.mff.cuni.cz/~ondra/benchmark_string/strcmp_profile.html

OK to commit this?

	* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
	(__strcmp_sse2_unaligned): Add several microoptimizations.
  

Comments

Ondrej Bilka June 20, 2015, 11:16 a.m. UTC | #1
I updated the page above to contain avx2 data; the profiler used is here:
http://kam.mff.cuni.cz/~ondra/benchmark_string/strcmp_profile200615.tar.bz2

I also found that there is a possible regression on core2, see:
http://kam.mff.cuni.cz/~ondra/benchmark_string/core2/strcmp_profile/results_gcc/result.html
http://kam.mff.cuni.cz/~ondra/benchmark_string/core2/strcmp_profile/results_rand/result.html

The problem is that for larger sizes the ssse3 variant is still faster,
but overall it is worse due to its high startup cost. Its big instruction
cache footprint is also a problem: when it is in the icache it becomes
beneficial for strings larger than 128 bytes, but when it is cold the
icache misses push that point to around 400 bytes:

http://kam.mff.cuni.cz/~ondra/benchmark_string/core2/strcmp_profile/results_rand_noicache/result.html

So how much do we care about core2? I know several functions that could
be improved there, but I put them in my backlog because they aren't that
important for me to optimize, and it is also nontrivial to do the switch
without harming the smaller sizes that are the hot path, rather than this
cold one.

So how should we proceed?
  

Patch

diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index 20b65fa..03d1b11 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -19,10 +19,13 @@ 
 #include "sysdep.h"
 
 ENTRY ( __strcmp_sse2_unaligned)
-	movl	%edi, %eax
-	xorl	%edx, %edx
 	pxor	%xmm7, %xmm7
-	orl	%esi, %eax
+	movl	%esi, %eax
+	andl	$4095, %eax
+	cmpl	$4032, %eax
+	jg	L(cross_page)
+
+	movl	%edi, %eax
 	andl	$4095, %eax
 	cmpl	$4032, %eax
 	jg	L(cross_page)
@@ -30,13 +33,11 @@  ENTRY ( __strcmp_sse2_unaligned)
 	movdqu	(%rsi), %xmm0
 	pcmpeqb	%xmm1, %xmm0
 	pminub	%xmm1, %xmm0
-	pxor	%xmm1, %xmm1
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	testq	%rax, %rax
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
 	je	L(next_48_bytes)
-L(return):
-	bsfq	%rax, %rdx
+	bsf	%eax, %edx
 	movzbl	(%rdi, %rdx), %eax
 	movzbl	(%rsi, %rdx), %edx
 	subl	%edx, %eax
@@ -50,29 +51,35 @@  L(next_48_bytes):
 	pcmpeqb	%xmm6, %xmm3
 	movdqu	32(%rsi), %xmm2
 	pminub	%xmm6, %xmm3
-	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm7, %xmm3
 	movdqu	48(%rdi), %xmm4
 	pcmpeqb	%xmm5, %xmm2
-	pmovmskb	%xmm3, %edx
+	pmovmskb %xmm3, %edx
 	movdqu	48(%rsi), %xmm0
 	pminub	%xmm5, %xmm2
-	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm7, %xmm2
 	pcmpeqb	%xmm4, %xmm0
-	pmovmskb	%xmm2, %eax
-	salq	$16, %rdx
+	pmovmskb %xmm2, %eax
+	sal	$16, %edx
 	pminub	%xmm4, %xmm0
-	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm7, %xmm0
 	salq	$32, %rax
 	orq	%rdx, %rax
-	pmovmskb	%xmm0, %ecx
-	movq	%rcx, %rdx
-	salq	$48, %rdx
-	orq	%rdx, %rax
-	jne	L(return)
+	pmovmskb %xmm0, %ecx
+	salq	$48, %rcx
+	orq	%rcx, %rax
+	je	L(main_loop_header)
+L(return):
+	bsf	%rax, %rdx
+	movzbl	(%rdi, %rdx), %eax
+	movzbl	(%rsi, %rdx), %edx
+	subl	%edx, %eax
+	ret
+
+
 L(main_loop_header):
 	leaq	64(%rdi), %rdx
 	movl	$4096, %ecx
-	pxor	%xmm9, %xmm9
 	andq	$-64, %rdx
 	subq	%rdi, %rdx
 	leaq	(%rdi, %rdx), %rax
@@ -82,16 +89,11 @@  L(main_loop_header):
 	subq	%rsi, %rcx
 	shrq	$6, %rcx
 	movq	%rcx, %rsi
-	jmp	L(loop_start)
 
 	.p2align 4
 L(loop):
-	addq	$64, %rax
-	addq	$64, %rdx
-L(loop_start):
-	testq	%rsi, %rsi
-	leaq	-1(%rsi), %rsi
-	je	L(loop_cross_page)
+	add	$-1, %rsi
+	ja	L(loop_cross_page)
 L(back_to_loop):
 	movdqu	(%rdx), %xmm0
 	movdqu	16(%rdx), %xmm1
@@ -104,61 +106,57 @@  L(back_to_loop):
 	movdqu	48(%rdx), %xmm6
 	pminub	%xmm3, %xmm1
 	movdqa	32(%rax), %xmm2
-	pminub	%xmm1, %xmm0
 	movdqa	48(%rax), %xmm3
 	pcmpeqb	%xmm2, %xmm5
 	pcmpeqb	%xmm3, %xmm6
+	addq	$64, %rax
 	pminub	%xmm2, %xmm5
 	pminub	%xmm3, %xmm6
-	pminub	%xmm5, %xmm0
-	pminub	%xmm6, %xmm0
-	pcmpeqb	%xmm7, %xmm0
-	pmovmskb	%xmm0, %ecx
+	addq	$64, %rdx
+	pminub	%xmm5, %xmm6
+	pminub	%xmm1, %xmm6
+	pminub	%xmm0, %xmm6
+	pcmpeqb	%xmm7, %xmm6
+	pmovmskb %xmm6, %ecx
 	testl	%ecx, %ecx
 	je	L(loop)
-	pcmpeqb	%xmm7, %xmm5
-	movdqu	(%rdx), %xmm0
-	pcmpeqb	%xmm7, %xmm1
-	movdqa	(%rax), %xmm2
-	pcmpeqb	%xmm2, %xmm0
-	pminub	%xmm2, %xmm0
-	pcmpeqb	%xmm7, %xmm6
 	pcmpeqb	%xmm7, %xmm0
-	pmovmskb	%xmm1, %ecx
-	pmovmskb	%xmm5, %r8d
-	pmovmskb	%xmm0, %edi
-	salq	$16, %rcx
+	pcmpeqb	%xmm7, %xmm1
+	pcmpeqb	%xmm7, %xmm5
+	pmovmskb %xmm0, %edi
+	pmovmskb %xmm1, %esi
+	pmovmskb %xmm5, %r8d
+	salq	$48, %rcx
 	salq	$32, %r8
-	pmovmskb	%xmm6, %esi
 	orq	%r8, %rcx
 	orq	%rdi, %rcx
-	salq	$48, %rsi
+	sal	$16, %esi
 	orq	%rsi, %rcx
 	bsfq	%rcx, %rcx
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
+	movzbl	-64(%rax, %rcx), %eax
+	movzbl	-64(%rdx, %rcx), %edx
 	subl	%edx, %eax
 	ret
 
 	.p2align 4
 L(loop_cross_page):
-	xor	%r10, %r10
+	xor	%ecx, %ecx
 	movq	%rdx, %r9
 	and	$63, %r9
-	subq	%r9, %r10
+	subq	%r9, %rcx
 
-	movdqa	(%rdx, %r10), %xmm0
-	movdqa	16(%rdx, %r10), %xmm1
-	movdqu	(%rax, %r10), %xmm2
-	movdqu	16(%rax, %r10), %xmm3
+	movdqa	(%rdx, %rcx), %xmm0
+	movdqa	16(%rdx, %rcx), %xmm1
+	movdqu	(%rax, %rcx), %xmm2
+	movdqu	16(%rax, %rcx), %xmm3
 	pcmpeqb	%xmm2, %xmm0
-	movdqa	32(%rdx, %r10), %xmm5
+	movdqa	32(%rdx, %rcx), %xmm5
 	pcmpeqb	%xmm3, %xmm1
 	pminub	%xmm2, %xmm0
-	movdqa	48(%rdx, %r10), %xmm6
+	movdqa	48(%rdx, %rcx), %xmm6
 	pminub	%xmm3, %xmm1
-	movdqu	32(%rax, %r10), %xmm2
-	movdqu	48(%rax, %r10), %xmm3
+	movdqu	32(%rax, %rcx), %xmm2
+	movdqu	48(%rax, %rcx), %xmm3
 	pcmpeqb	%xmm2, %xmm5
 	pcmpeqb	%xmm3, %xmm6
 	pminub	%xmm2, %xmm5
@@ -169,12 +167,12 @@  L(loop_cross_page):
 	pcmpeqb	%xmm7, %xmm5
 	pcmpeqb	%xmm7, %xmm6
 
-	pmovmskb	%xmm1, %ecx
-	pmovmskb	%xmm5, %r8d
-	pmovmskb	%xmm0, %edi
-	salq	$16, %rcx
+	pmovmskb %xmm1, %ecx
+	pmovmskb %xmm5, %r8d
+	pmovmskb %xmm0, %edi
+	sal	$16, %ecx
 	salq	$32, %r8
-	pmovmskb	%xmm6, %esi
+	pmovmskb %xmm6, %esi
 	orq	%r8, %rdi
 	orq	%rcx, %rdi
 	salq	$48, %rsi
@@ -190,20 +188,21 @@  L(loop_cross_page):
 	subl	%edx, %eax
 	ret
 
+L(cross_page):
+	xorl	%edx, %edx
+	jmp	L(cross_page_loop_start)
 	.p2align 4
 L(cross_page_loop):
-	cmpb	%cl, %al
-	jne	L(different)
-	addq	$1, %rdx
-	cmpq	$64, %rdx
+	add	$1, %edx
+	cmp	$64, %edx
 	je	L(main_loop_header)
-L(cross_page):
+L(cross_page_loop_start):
 	movzbl	(%rdi, %rdx), %eax
 	movzbl	(%rsi, %rdx), %ecx
-	testb	%al, %al
+	subl	%ecx, %eax
+	jne	L(different)
+	test	%ecx, %ecx
 	jne	L(cross_page_loop)
-	xorl	%eax, %eax
 L(different):
-	subl	%ecx, %eax
 	ret
 END (__strcmp_sse2_unaligned)