From patchwork Fri Jun 26 07:12:54 2015
X-Patchwork-Submitter: Ondrej Bilka
X-Patchwork-Id: 7356
Date: Fri, 26 Jun 2015 09:12:54 +0200
From: Ondřej Bílka
To: libc-alpha@sourceware.org
Subject: [PATCH neleai/string-x64] Reoptimize strlen and strnlen
Message-ID: <20150626071254.GA1789@domone>

Hi,

I optimized strlen long ago; back then my main focus was to improve
performance on core2 while keeping reasonable performance on athlons
and old atoms.

The main change is that I now check bytes 16-64 with unaligned loads
instead of aligning these accesses to 16 bytes.  Aligning helped on
older processors, but unaligned loads are now better on i7.

I don't remember whether last time I kept xoring the first four xmm
registers when checking with unaligned loads, or whether I read from
(%rax) instead of (%rdi), which increased latency; either way, plain
unaligned loads are now faster on core2 as well.

Then I made several microoptimizations, like using edx instead of rdx
to save space, and reordering instructions to improve scheduling.

I also tested an avx2 version; again it doesn't help much: on haswell
the difference is 0.2%, while the new sse2 version is 1% faster on
haswell.  Full graphs are here; the only problem I could find is a
0.3% decrease on fx10.  I could reintroduce an ifunc to handle atom
and avx2, but is that worth it?

http://kam.mff.cuni.cz/~ondra/benchmark_string/strlen_profile.html

For reference, a rough C sketch of the new layout is appended after
the patch.

OK to commit this?

	* sysdeps/x86_64/strlen.S (strlen): Add microoptimizations.

---
 sysdeps/x86_64/strlen.S | 336 ++++++++++++++++++++++++------------------------
 1 file changed, 169 insertions(+), 167 deletions(-)

diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index c382c8d..3e8beb0 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,5 +1,5 @@
 /* SSE2 version of strlen.
-   Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   Copyright (C) 2012-2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -18,222 +18,224 @@
 
 #include <sysdep.h>
 
-/* Long lived register in strlen(s), strnlen(s, n) are:
-
-	%xmm11 - zero
-	%rdi   - s
-	%r10  (s+n) & (~(64-1))
-	%r11   s+n
-*/
 
 	.text
 ENTRY(strlen)
-
-/* Test 64 bytes from %rax for zero.  Save result as bitmask in %rdx.  */
-#define FIND_ZERO	\
-	pcmpeqb	(%rax), %xmm8;	\
-	pcmpeqb	16(%rax), %xmm9;	\
-	pcmpeqb	32(%rax), %xmm10;	\
-	pcmpeqb	48(%rax), %xmm11;	\
-	pmovmskb	%xmm8, %esi;	\
-	pmovmskb	%xmm9, %edx;	\
-	pmovmskb	%xmm10, %r8d;	\
-	pmovmskb	%xmm11, %ecx;	\
-	salq	$16, %rdx;	\
-	salq	$16, %rcx;	\
-	orq	%rsi, %rdx;	\
-	orq	%r8, %rcx;	\
-	salq	$32, %rcx;	\
-	orq	%rcx, %rdx;
-
 #ifdef AS_STRNLEN
-/* Do not read anything when n==0.  */
+	mov	%rsi, %r8
+	xor	%edx, %edx
 	test	%rsi, %rsi
-	jne	L(n_nonzero)
-	xor	%rax, %rax
-	ret
-L(n_nonzero):
-
-/* Initialize long lived registers.  */
-
-	add	%rdi, %rsi
-	mov	%rsi, %r10
-	and	$-64, %r10
-	mov	%rsi, %r11
+	je	L(return_zero)
+	cmp	$64, %rsi
+	jae	L(dont_set)
+	bts	%rsi, %rdx
+L(dont_set):
 #endif
-
-	pxor	%xmm8, %xmm8
-	pxor	%xmm9, %xmm9
-	pxor	%xmm10, %xmm10
-	pxor	%xmm11, %xmm11
-	movq	%rdi, %rax
-	movq	%rdi, %rcx
-	andq	$4095, %rcx
-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
-	cmpq	$4047, %rcx
-/* We cannot unify this branching as it would be ~6 cycles slower.  */
+	pxor	%xmm0, %xmm0
+	mov	%edi, %ecx
+	and	$4095, %ecx
+	cmp	$4032, %ecx
 	ja	L(cross_page)
-
+	movdqu	(%rdi), %xmm4
+	pcmpeqb	%xmm0, %xmm4
+	pmovmskb	%xmm4, %ecx
 #ifdef AS_STRNLEN
-/* Test if end is among first 64 bytes.  */
-# define STRNLEN_PROLOG	\
-	mov	%r11, %rsi;	\
-	subq	%rax, %rsi;	\
-	andq	$-64, %rax;	\
-	testq	$-64, %rsi;	\
-	je	L(strnlen_ret)
+	or	%dx, %cx
 #else
-# define STRNLEN_PROLOG  andq $-64, %rax;
+	test	%ecx, %ecx
 #endif
-
-/* Ignore bits in mask that come before start of string.  */
-#define PROLOG(lab)	\
-	movq	%rdi, %rcx;	\
-	xorq	%rax, %rcx;	\
-	STRNLEN_PROLOG;	\
-	sarq	%cl, %rdx;	\
-	test	%rdx, %rdx;	\
-	je	L(lab);	\
-	bsfq	%rdx, %rax;	\
+	je	L(next48_bytes)
+	bsf	%ecx, %eax
 	ret
 
 #ifdef AS_STRNLEN
-	andq	$-16, %rax
-	FIND_ZERO
-#else
-	/* Test first 16 bytes unaligned.  */
-	movdqu	(%rax), %xmm12
-	pcmpeqb	%xmm8, %xmm12
-	pmovmskb	%xmm12, %edx
-	test	%edx, %edx
-	je	L(next48_bytes)
-	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
+L(return_zero):
+	xor	%eax, %eax
 	ret
-
+L(return_noread):
+	add	$64, %rax
+	sub	%rdi, %rax
+	ret
+#endif
+	.p2align 4
 L(next48_bytes):
-/* Same as FIND_ZERO except we do not check first 16 bytes.  */
-	andq	$-16, %rax
-	pcmpeqb	16(%rax), %xmm9
-	pcmpeqb	32(%rax), %xmm10
-	pcmpeqb	48(%rax), %xmm11
-	pmovmskb	%xmm9, %edx
-	pmovmskb	%xmm10, %r8d
-	pmovmskb	%xmm11, %ecx
-	salq	$16, %rdx
-	salq	$16, %rcx
-	orq	%r8, %rcx
+	movdqu	16(%rdi), %xmm1
+	movdqu	32(%rdi), %xmm2
+	movdqu	48(%rdi), %xmm3
+	pcmpeqb	%xmm0, %xmm1
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm0, %xmm3
+#ifdef AS_STRNLEN
+	pmovmskb	%xmm1, %ecx
+	sal	$16, %ecx
+	or	%rcx, %rdx
+#else
+	pmovmskb	%xmm1, %edx
+	sal	$16, %edx
+#endif
+	pmovmskb	%xmm2, %esi
+	pmovmskb	%xmm3, %ecx
+	sal	$16, %ecx
+	or	%esi, %ecx
 	salq	$32, %rcx
 	orq	%rcx, %rdx
-#endif
-
-	/* When no zero byte is found xmm9-11 are zero so we do not have to
-	   zero them.  */
-	PROLOG(loop)
+	je	L(loop_init)
+	bsfq	%rdx, %rax
+	ret
 
 	.p2align 4
 L(cross_page):
-	andq	$-64, %rax
-	FIND_ZERO
-	PROLOG(loop_init)
+	movq	%rdi, %rax
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
 
 #ifdef AS_STRNLEN
-/* We must do this check to correctly handle strnlen (s, -1).  */
-L(strnlen_ret):
-	bts	%rsi, %rdx
+	mov	%rdx, %r9
+#endif
+	andq	$-64, %rax
+	pcmpeqb	(%rax), %xmm0
+	pcmpeqb	16(%rax), %xmm1
+	pcmpeqb	32(%rax), %xmm2
+	pcmpeqb	48(%rax), %xmm3
+	pmovmskb	%xmm0, %esi
+	pxor	%xmm0, %xmm0
+	pmovmskb	%xmm1, %edx
+	pmovmskb	%xmm2, %r10d
+	pmovmskb	%xmm3, %ecx
+	sal	$16, %edx
+	sal	$16, %ecx
+	or	%esi, %edx
+	or	%r10, %rcx
+	salq	$32, %rcx
+	orq	%rcx, %rdx
+	mov	%edi, %ecx
+#ifdef AS_STRNLEN
+	salq	%cl, %r9
+	or	%r9, %rdx
+#endif
 	sarq	%cl, %rdx
 	test	%rdx, %rdx
 	je	L(loop_init)
 	bsfq	%rdx, %rax
 	ret
-#endif
 
 	.p2align 4
 L(loop_init):
-	pxor	%xmm9, %xmm9
-	pxor	%xmm10, %xmm10
-	pxor	%xmm11, %xmm11
+	movq	%rdi, %rax
+	andq	$-64, %rax
 
 #ifdef AS_STRNLEN
+	add	%rdi, %r8
+	sub	%rax, %r8
+	cmp	$64, %r8
+	je	L(return_noread)
+#endif
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+#ifdef USE_AVX2
+	vpxor	%xmm0, %xmm0, %xmm0
+#endif
 	.p2align 4
 L(loop):
+#ifdef USE_AVX2
+	vmovdqa	64(%rax), %ymm1
+	vpminub	96(%rax), %ymm1, %ymm2
+	vpcmpeqb	%ymm0, %ymm2, %ymm2
+	vpmovmskb	%ymm2, %edx
+#else
+	movdqa	64(%rax), %xmm5
+	pminub	80(%rax), %xmm5
+	pminub	96(%rax), %xmm5
+	pminub	112(%rax), %xmm5
+	pcmpeqb	%xmm0, %xmm5
+	pmovmskb	%xmm5, %edx
+#endif
-	addq	$64, %rax
-	cmpq	%rax, %r10
-	je	L(exit_end)
-
-	movdqa	(%rax), %xmm8
-	pminub	16(%rax), %xmm8
-	pminub	32(%rax), %xmm8
-	pminub	48(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+#ifdef AS_STRNLEN
+	sub	$64, %r8
 	testl	%edx, %edx
-	jne	L(exit)
-	jmp	L(loop)
-
-	.p2align 4
-L(exit_end):
-	cmp	%rax, %r11
-	je	L(first) /* Do not read when end is at page boundary.  */
-	pxor	%xmm8, %xmm8
-	FIND_ZERO
-
-L(first):
-	bts	%r11, %rdx
-	bsfq	%rdx, %rdx
-	addq	%rdx, %rax
-	subq	%rdi, %rax
-	ret
-
-	.p2align 4
-L(exit):
-	pxor	%xmm8, %xmm8
-	FIND_ZERO
-
-	bsfq	%rdx, %rdx
-	addq	%rdx, %rax
-	subq	%rdi, %rax
-	ret
-
+	jne	L(exit64)
+	cmp	$64, %r8
+	jbe	L(exit64_zero)
 #else
-
-	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
-	.p2align 4
-L(loop):
-
-	movdqa	64(%rax), %xmm8
-	pminub	80(%rax), %xmm8
-	pminub	96(%rax), %xmm8
-	pminub	112(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
 	testl	%edx, %edx
 	jne	L(exit64)
+#endif
 	subq	$-128, %rax
-
-	movdqa	(%rax), %xmm8
-	pminub	16(%rax), %xmm8
-	pminub	32(%rax), %xmm8
-	pminub	48(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+#ifdef USE_AVX2
+	vmovdqa	(%rax), %ymm1
+	vpminub	32(%rax), %ymm1, %ymm2
+	vpcmpeqb	%ymm0, %ymm2, %ymm2
+	vpmovmskb	%ymm2, %edx
+#else
+	movdqa	(%rax), %xmm5
+	pminub	16(%rax), %xmm5
+	pminub	32(%rax), %xmm5
+	pminub	48(%rax), %xmm5
+	pcmpeqb	%xmm0, %xmm5
+	pmovmskb	%xmm5, %edx
+#endif
+#ifdef AS_STRNLEN
+	sub	$64, %r8
 	testl	%edx, %edx
 	jne	L(exit0)
+	cmp	$64, %r8
+	jbe	L(exit0_zero)
+#else
+	testl	%edx, %edx
+	jne	L(exit0)
+#endif
 	jmp	L(loop)
+#ifdef AS_STRNLEN
+	.p2align 4
+L(exit64_zero):
+	addq	$64, %rax
+L(exit0_zero):
+	add	%r8, %rax
+	sub	%rdi, %rax
+	ret
+#endif
 
 	.p2align 4
+
+
 L(exit64):
 	addq	$64, %rax
 L(exit0):
-	pxor	%xmm8, %xmm8
-	FIND_ZERO
-
+#ifdef USE_AVX2
+	sal	$32, %rdx
+#else
+	sal	$48, %rdx
+#endif
+#ifdef AS_STRNLEN
+	cmp	$64, %r8
+	jae	L(dont_set2)
+	bts	%r8, %rdx
+L(dont_set2):
+#endif
+#ifdef USE_AVX2
+	subq	%rdi, %rax
+	vpcmpeqb	%ymm0, %ymm1, %ymm1
+	vpmovmskb	%ymm1, %ecx
+	vzeroupper
+	or	%rcx, %rdx
+#else
+	pcmpeqb	(%rax), %xmm0
+	pcmpeqb	16(%rax), %xmm1
+	pcmpeqb	32(%rax), %xmm2
+	subq	%rdi, %rax
+	pmovmskb	%xmm0, %esi
+	pmovmskb	%xmm1, %ecx
+	pmovmskb	%xmm2, %r8d
+	sal	$16, %ecx
+	or	%esi, %ecx
+	salq	$32, %r8
+	orq	%r8, %rcx
+	orq	%rcx, %rdx
+#endif
 	bsfq	%rdx, %rdx
 	addq	%rdx, %rax
-	subq	%rdi, %rax
 	ret
-
-#endif
 
 END(strlen)
 libc_hidden_builtin_def (strlen)
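
For readers who prefer C to AT&T assembly, here is a rough, illustrative
sketch of the structure the new strlen follows: unaligned checks of the
first 64 bytes when they cannot cross a page, an aligned-load-and-shift
fallback near a page end, then an aligned 64-byte pminub loop.  This is
only documentation of the approach, not the code that gets built; the
names (zero_mask64, strlen_sketch) are invented for this mail, the
strnlen bookkeeping and the avx2 variant are omitted, and it assumes the
GCC __builtin_ctz family for the bit scans that bsf does in the assembly.

#include <emmintrin.h>	/* SSE2 intrinsics.  */
#include <stddef.h>
#include <stdint.h>

/* Bit i of the result is set iff p[i] == 0, for the 64 aligned bytes
   at P.  This is what the old FIND_ZERO macro computed and what the
   new exit paths still compute to locate the terminator exactly.  */
static uint64_t
zero_mask64 (const char *p)
{
  const __m128i zero = _mm_setzero_si128 ();
  uint64_t m0 = (uint16_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) p), zero));
  uint64_t m1 = (uint16_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p + 16)), zero));
  uint64_t m2 = (uint16_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p + 32)), zero));
  uint64_t m3 = (uint16_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p + 48)), zero));
  return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);
}

size_t
strlen_sketch (const char *s)
{
  const __m128i zero = _mm_setzero_si128 ();
  const char *base = (const char *) ((uintptr_t) s & ~(uintptr_t) 63);

  if (((uintptr_t) s & 4095) <= 4032)
    {
      /* The first 64 bytes cannot cross a page boundary, so check them
	 with four unaligned loads instead of aligning first.  */
      for (size_t i = 0; i < 64; i += 16)
	{
	  __m128i v = _mm_loadu_si128 ((const __m128i *) (s + i));
	  int m = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, zero));
	  if (m != 0)
	    return i + __builtin_ctz (m);
	}
    }
  else
    {
      /* L(cross_page): read the aligned 64-byte block containing S and
	 shift away the bits that belong to bytes before the string.  */
      uint64_t m = zero_mask64 (base) >> ((uintptr_t) s & 63);
      if (m != 0)
	return __builtin_ctzll (m);
    }

  /* Main loop, 64 bytes per iteration from an aligned address.  */
  for (const char *p = base + 64; ; p += 64)
    {
      __m128i v = _mm_load_si128 ((const __m128i *) p);
      v = _mm_min_epu8 (v, _mm_load_si128 ((const __m128i *) (p + 16)));
      v = _mm_min_epu8 (v, _mm_load_si128 ((const __m128i *) (p + 32)));
      v = _mm_min_epu8 (v, _mm_load_si128 ((const __m128i *) (p + 48)));
      if (_mm_movemask_epi8 (_mm_cmpeq_epi8 (v, zero)) != 0)
	return (size_t) (p - s) + __builtin_ctzll (zero_mask64 (p));
    }
}

The pminub trick is the same one the old loop used: the byte-wise
minimum of the four vectors contains a zero byte exactly when one of the
inputs does, so one pcmpeqb/pmovmskb per 64 bytes is enough to detect
the terminator, and its exact position is only recomputed on the exit
path.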