From patchwork Sat Jun 20 11:15:20 2015
X-Patchwork-Submitter: Ondrej Bilka
X-Patchwork-Id: 7269
Date: Sat, 20 Jun 2015 13:15:20 +0200
From: Ondřej Bílka
To: libc-alpha@sourceware.org
Subject: Re: [PATCH 2/2 neleai/string-x64] Add strcmp with avx2
Message-ID: <20150620111520.GA12420@domone>
References: <20150620083525.GA31992@domone>
In-Reply-To: <20150620083525.GA31992@domone>

On Sat, Jun 20, 2015 at 10:35:25AM +0200, Ondřej Bílka wrote:
> Hi,
>
> When I read strcmp again to improve strncmp and add an avx2 strcmp,
> I found that I had made several mistakes, mainly caused by first
> optimizing the C template and then fixing the assembly.
>
> The first was my idea to simplify the cross-page check by oring the
> src and dest addresses. I recall that I originally did complex
> cross-page handling where false positives were cheap. Then I found
> that, due to its size, it had a big overhead and a simple loop was
> faster when testing with firefox, which turned the original decision
> into a bad one.
>
> The second is to reorganize the loop instructions so that after the
> loop ends I can find the last byte without recalculating much, using
> the trick that the last 16-bit mask can be ored with the previous
> three, as it is only relevant when the previous three are zero.
>
> The third is that gcc places pointer increments badly in loops. They
> should be placed after the loads that use them, not at the start of
> the loop as gcc does. That change is responsible for a 10% improvement
> for large sizes.
>
> Finally there are microoptimizations that save a few bytes without
> measurable performance impact, like using eax instead of rax to save a
> byte, or moving zeroing instructions out of paths where they are not
> needed.
>
> Profile data are here; numbers with avx2 for haswell, which I will
> submit next, will follow shortly.
>
> http://kam.mff.cuni.cz/~ondra/benchmark_string/strcmp_profile.html
>
> OK to commit this?
>

Here is the avx2 loop that I promised earlier. Luckily it gives a small
2% practical benefit on the gcc workload, and it also roughly doubles
performance on longer inputs.

	* sysdeps/x86_64/multiarch/Makefile: Add strcmp-avx2.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Add __strcmp_avx2.
	* sysdeps/x86_64/multiarch/strcmp-avx2.S: New file.
	* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Add avx2 loop.
	* sysdeps/x86_64/multiarch/strcmp.S: Add ifunc.
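To make the mask trick from the second point above easier to follow,
here is a rough C sketch of what one 64-byte iteration of the avx2 loop
computes. It is only an illustration, not the patch itself: the helper
name compare_64bytes and the *stop flag are made up for the sketch, and
the real code keeps everything in ymm registers and branches directly.

#include <immintrin.h>
#include <stdint.h>

/* Hypothetical helper: examine one 64-byte block of both strings.
   Sets *stop when the real loop would leave L(loop); only then is the
   return value meaningful.  Otherwise the caller advances both
   pointers by 64 and keeps looping.  */
static int
compare_64bytes (const unsigned char *s1, const unsigned char *s2,
		 int *stop)
{
  const __m256i zero = _mm256_setzero_si256 ();
  __m256i b0 = _mm256_loadu_si256 ((const __m256i *) s1);
  __m256i b1 = _mm256_loadu_si256 ((const __m256i *) (s1 + 32));
  __m256i a0 = _mm256_loadu_si256 ((const __m256i *) s2);
  __m256i a1 = _mm256_loadu_si256 ((const __m256i *) (s2 + 32));

  /* pcmpeqb gives 0xff where the bytes match; pminub with the s1 bytes
     then leaves a zero byte exactly where the strings differ or where
     s1 has a nul.  */
  __m256i m0 = _mm256_min_epu8 (_mm256_cmpeq_epi8 (a0, b0), b0);
  __m256i m1 = _mm256_min_epu8 (_mm256_cmpeq_epi8 (a1, b1), b1);

  /* One combined mask for all 64 bytes decides whether to loop.  */
  uint32_t both = _mm256_movemask_epi8
    (_mm256_cmpeq_epi8 (_mm256_min_epu8 (m0, m1), zero));
  if (both == 0)
    {
      *stop = 0;
      return 0;
    }
  *stop = 1;

  /* The oring trick: the high 32 bits may hold the combined mask
     instead of the second half's own mask, because bsf only reaches
     them when the low 32 bits (the first half's mask) are all zero.  */
  uint32_t first = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (m0, zero));
  uint64_t mask = ((uint64_t) both << 32) | first;
  unsigned int i = __builtin_ctzll (mask);	/* bsfq */
  return s1[i] - s2[i];
}

This should build with gcc -O2 -mavx2; it is only meant to show why
oring the combined mask into the upper half of the 64-bit mask is safe.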
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d01bbbe..bf48283 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -30,7 +30,7 @@ CFLAGS-strspn-c.c += -msse4
 endif
 
 ifeq (yes,$(config-cflags-avx2))
-sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2
+sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2 strcmp-avx2
 endif
 endif
 
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index cc6f9f2..57ce237 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -126,7 +126,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcmp.S.  */
   IFUNC_IMPL (i, name, strcmp,
-	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSE4_2, __strcmp_sse42)
+	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_AVX2, __strcmp_avx2)
 	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSSE3, __strcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2))
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
new file mode 100644
index 0000000..b2f8478
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AVX2
+#define __strcmp_sse2_unaligned __strcmp_avx2
+#include "strcmp-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index 03d1b11..10bed9a 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -89,12 +89,35 @@ L(main_loop_header):
 	subq	%rsi, %rcx
 	shrq	$6, %rcx
 	movq	%rcx, %rsi
-
+#ifdef USE_AVX2
+	vpxor	%xmm7, %xmm7, %xmm7
+#endif
 	.p2align 4
 L(loop):
 	add	$-1, %rsi
 	ja	L(loop_cross_page)
 L(back_to_loop):
+#ifdef USE_AVX2
+	vmovdqu	(%rdx), %ymm0
+	vmovdqu	32(%rdx), %ymm1
+	vpcmpeqb	(%rax), %ymm0, %ymm0
+	vpminub	(%rax), %ymm0, %ymm0
+	vpcmpeqb	32(%rax), %ymm1, %ymm1
+	vpminub	32(%rax), %ymm1, %ymm1
+	vpminub	%ymm0, %ymm1, %ymm2
+	vpcmpeqb	%ymm7, %ymm2, %ymm2
+	addq	$64, %rax
+	addq	$64, %rdx
+	vpmovmskb	%ymm2, %esi
+	test	%esi, %esi
+	je	L(loop)
+	shl	$32, %rsi
+	vpcmpeqb	%ymm7, %ymm0, %ymm0
+	vpmovmskb	%ymm0, %ecx
+	or	%rsi, %rcx
+	vzeroupper
+#else
+
 	movdqu	(%rdx), %xmm0
 	movdqu	16(%rdx), %xmm1
 	movdqa	(%rax), %xmm2
@@ -132,14 +155,17 @@ L(back_to_loop):
 	orq	%rdi, %rcx
 	sal	$16, %esi
 	orq	%rsi, %rcx
+#endif
 	bsfq	%rcx, %rcx
 	movzbl	-64(%rax, %rcx), %eax
 	movzbl	-64(%rdx, %rcx), %edx
 	subl	%edx, %eax
 	ret
-
 	.p2align 4
 L(loop_cross_page):
+#ifdef USE_AVX2
+	vzeroupper
+#endif
 	xor	%ecx, %ecx
 	movq	%rdx, %r9
 	and	$63, %r9
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index f50f26c..867e9d4 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -90,6 +90,12 @@ ENTRY(STRCMP)
 	call	__init_cpu_features
 1:
 #ifdef USE_AS_STRCMP
+# ifdef HAVE_AVX2_SUPPORT
+
+	leaq	__strcmp_avx2(%rip), %rax
+	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
+	jnz	3f
+# endif
 	leaq	__strcmp_sse2_unaligned(%rip), %rax
 	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
 	jnz	3f