[neleai/string-x64] Faster strcasecmp.

Message ID 20150619170127.GA17567@domone
State New, archived

Commit Message

Ondrej Bilka June 19, 2015, 5:01 p.m. UTC
  Hi, this is another patch from my backlog. It adapts my strcmp approach
to strcasecmp.

On my todo list I had an item to find profiling information about case
mismatches, since I suspected that a vectorized tolower harms
performance.

Now I could check that with dryrun data, though I would still need more
for a definitive answer. So far I found that these mismatches are rare,
occurring less than once per 100 characters. That makes a vectorized
tolower questionable: it costs cycles on every block but saves fewer
cycles by filtering out rare false positives, so a better approach is to
do an ordinary tolower conversion only on the few candidate bytes. One
factor behind this is that uppercase letters are themselves rare.
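
To make the tradeoff concrete, here is a minimal C model of that
strategy (my sketch, not code from the patch): compare raw bytes first
and consult the tolower table, one int entry per character as in glibc,
only at positions where the raw bytes differ. The plain byte loop
stands in for the patch's 16- and 64-byte vector scans.

  #include <stddef.h>

  /* Sketch only: table lookups happen just for the rare bytes that
     differ; identical bytes, the common case, never touch the table.  */
  static int
  strcasecmp_model (const unsigned char *a, const unsigned char *b,
                    const int *table)
  {
    for (size_t i = 0;; i++)
      {
        if (a[i] == b[i])
          {
            if (a[i] == '\0')   /* Both strings ended together.  */
              return 0;
            continue;           /* Equal bytes need no tolower.  */
          }
        int diff = table[a[i]] - table[b[i]];
        if (diff != 0)          /* Genuine difference, or one string ended.  */
          return diff;
        /* Case-only mismatch such as 'A' vs 'a': keep scanning.  */
      }
  }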

The second issue was a technical problem: I needed to export the tolower
table outside glibc. For now I solved that by getting the libc
implementation with dlopen.
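
As an illustration of the dlopen trick, the following sketch reads the
table through the public __ctype_tolower_loc symbol; whether the
profiler in the tarball uses exactly this route is my assumption.

  #include <dlfcn.h>
  #include <stdint.h>
  #include <stdio.h>

  int
  main (void)
  {
    void *libc = dlopen ("libc.so.6", RTLD_NOW);
    if (libc == NULL)
      return 1;
    /* __ctype_tolower_loc returns the address of the thread's tolower
       table pointer; the table is biased so indices -128..255 work.  */
    const int32_t **(*tolower_loc) (void)
      = (const int32_t **(*) (void)) dlsym (libc, "__ctype_tolower_loc");
    if (tolower_loc == NULL)
      return 1;
    const int32_t *table = *tolower_loc ();
    printf ("tolower('A') = %c\n", (char) table[(unsigned char) 'A']);
    dlclose (libc);
    return 0;
  }

(Link with -ldl on glibc versions where dlopen is not in libc itself.)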

It is 60% faster than the current SSE4.2 code on a gcc workload on Haswell.

http://kam.mff.cuni.cz/~ondra/benchmark_string/haswell/strcasecmp_profile/results_gcc/result.html

Most of that gain can be explained by the fact that we start with:

  if (table[a[0]] - table[b[0]])
    return table[a[0]] - table[b[0]];

For larger sizes my strcasecmp is three times faster than the current
one; the exact speedup depends on the number of case mismatches.

http://kam.mff.cuni.cz/~ondra/benchmark_string/haswell/strcasecmp_profile/results_rand/result.html

Results for other CPUs can be found here, and the profiler itself is in
the tarball below.
http://kam.mff.cuni.cz/~ondra/benchmark_string/strcasecmp_profile.html
http://kam.mff.cuni.cz/~ondra/benchmark_string/strcasecmp_profile190615.tar.bz2

Comments?

	* sysdeps/x86_64/locale-defines.sym: Add LOCALE_TOLOWER.
	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
	strcasecmp_l-sse2-unaligned.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Add __strcasecmp_sse2_unaligned and
	__strcasecmp_sse2_unaligned_l; remove AVX and SSE4.2 variants.
	* sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S: New file.
	* sysdeps/x86_64/multiarch/strcasecmp_l.S: Inline ifunc selector
	and use unaligned strcasecmp.

---
 sysdeps/x86_64/locale-defines.sym                  |   1 +
 sysdeps/x86_64/multiarch/Makefile                  |   1 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c         |  12 +-
 .../x86_64/multiarch/strcasecmp_l-sse2-unaligned.S | 294 +++++++++++++++++++++
 sysdeps/x86_64/multiarch/strcasecmp_l.S            | 104 +++++++-
 5 files changed, 401 insertions(+), 11 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
  

Patch

diff --git a/sysdeps/x86_64/locale-defines.sym b/sysdeps/x86_64/locale-defines.sym
index aebff9a..804debb 100644
--- a/sysdeps/x86_64/locale-defines.sym
+++ b/sysdeps/x86_64/locale-defines.sym
@@ -8,4 +8,5 @@  LOCALE_T___LOCALES		offsetof (struct __locale_struct, __locales)
 LC_CTYPE
 _NL_CTYPE_NONASCII_CASE
 LOCALE_DATA_VALUES		offsetof (struct __locale_data, values)
+LOCALE_TOLOWER			offsetof (struct __locale_struct, __ctype_tolower)
 SIZEOF_VALUES			sizeof (((struct __locale_data *) 0)->values[0])
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 679db2a..d01bbbe 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -14,6 +14,7 @@  sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
 		   memmove-ssse3-back strcasecmp_l-ssse3 \
 		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
+		   strcasecmp_l-sse2-unaligned \
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index b3dbe65..cc6f9f2 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -94,20 +94,16 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S.  */
   IFUNC_IMPL (i, name, strcasecmp,
-	      IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_AVX,
-			      __strcasecmp_avx)
-	      IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSE4_2,
-			      __strcasecmp_sse42)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp, 1,
+			      __strcasecmp_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSSE3,
 			      __strcasecmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S.  */
   IFUNC_IMPL (i, name, strcasecmp_l,
-	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_AVX,
-			      __strcasecmp_l_avx)
-	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_SSE4_2,
-			      __strcasecmp_l_sse42)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
+			      __strcasecmp_sse2_unaligned_l)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_SSSE3,
 			      __strcasecmp_l_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
new file mode 100644
index 0000000..977e028
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
@@ -0,0 +1,294 @@ 
+/* strcasecmp with unaligned loads
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# include "locale-defines.h"
+
+
+ENTRY (__strcasecmp_sse2_unaligned)
+	/* Fetch the current locale and fall through to the _l variant.  */
+	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
+	mov	%fs:(%rax), %rdx
+	// XXX 5 byte should be before the function
+	/* 5-byte NOP.  */
+	.byte	0x0f,0x1f,0x44,0x00,0x00
+
+END (__strcasecmp_sse2_unaligned)
+
+ENTRY (__strcasecmp_sse2_unaligned_l)
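+	/* The locale is in %rdx: passed as the third argument of
+	   strcasecmp_l, or loaded from TLS by the fall-through entry
+	   above.  %r11 points to the tolower table, one int32 entry per
+	   character; check the first bytes before any vector work.  */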
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	mov	LOCALE_TOLOWER(%rdx), %r11
+	movl	(%r11,%rax,4), %eax
+	subl	(%r11,%rcx,4), %eax
+	je	L(next)
+L(return):
+	ret	
+L(next):
+	test	%ecx, %ecx
+	je	L(return)
+	leaq	1(%rsi), %rsi
+	leaq	1(%rdi), %rdi
+
+	pxor	%xmm7, %xmm7
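+	/* If either string starts within the last 64 bytes of a page,
+	   avoid the unaligned 16-byte loads below and take the byte
+	   loop instead.  */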
+	movl	%edi, %eax
+	andl	$4095, %eax
+	cmpl	$4032, %eax
+	jg	L(cross_page_start)
+	movl	%esi, %eax
+	andl	$4095, %eax
+	cmpl	$4032, %eax
+	jg	L(cross_page_start)
+	movdqu	(%rdi), %xmm1
+	movdqu	(%rsi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pminub	%xmm1, %xmm0
+	pxor	%xmm1, %xmm1
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	je	L(next_48_bytes)
+		
+L(loop1):
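+	/* Each set bit in %ecx marks one of the first 16 bytes that
+	   differs or may end the string; resolve them one by one
+	   through the tolower table.  */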
+	bsf	%ecx, %r8d
+	movzbl	(%rdi,%r8), %eax
+	movzbl	(%rsi,%r8), %r8d
+	movl	(%r11,%rax,4), %eax
+	subl	(%r11,%r8,4), %eax
+	jne	L(return)
+	test	%r8d, %r8d
+	je	L(return)
+	leaq	-1(%rcx), %rax
+	andq	%rax, %rcx
+	je	L(next_48_bytes)
+	jmp	L(loop1)
+
+	.p2align 4
+L(next_48_bytes):
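+	/* Check bytes 16..63 the same way, merging the three 16-bit
+	   masks into one 64-bit mask.  */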
+	movdqu	16(%rdi), %xmm6
+	movdqu	16(%rsi), %xmm3
+	movdqu	32(%rdi), %xmm5
+	pcmpeqb	%xmm6, %xmm3
+	movdqu	32(%rsi), %xmm2
+	pminub	%xmm6, %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	movdqu	48(%rdi), %xmm4
+	pcmpeqb	%xmm5, %xmm2
+	pmovmskb %xmm3, %edx
+	movdqu	48(%rsi), %xmm0
+	pminub	%xmm5, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm2, %eax
+	sal	$16, %edx
+	pminub	%xmm4, %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	salq	$32, %rax
+	orq	%rdx, %rax
+	pmovmskb %xmm0, %ecx
+	salq	$48, %rcx
+	orq	%rax, %rcx
+	jne	L(caseloop2)
+L(main_loop_header):	
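+	/* Round the first string up to a 64-byte boundary and count how
+	   many 64-byte blocks fit before the second string reaches a
+	   page boundary; %rsi becomes the loop counter.  */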
+	leaq	64(%rdi), %rdx
+	movl	$4096, %ecx
+	pxor	%xmm9, %xmm9
+	andq	$-64, %rdx
+	subq	%rdi, %rdx
+	leaq	(%rdi, %rdx), %rax
+	addq	%rsi, %rdx
+	movq	%rdx, %rsi
+	andl	$4095, %esi
+	subq	%rsi, %rcx
+	shrq	$6, %rcx
+	movq	%rcx, %rsi
+	jmp	L(loop_start)
+		
+L(caseloop2):	
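+	/* %rcx marks the bytes in the current 64 of %rdi/%rsi that
+	   differ or may end the string; run each through the table and
+	   stop on a real difference or NUL.  */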
+	bsf	%rcx, %r8
+	movzbl	(%rdi,%r8), %eax
+	movzbl	(%rsi,%r8), %r8d
+	movl	(%r11,%rax,4), %eax
+	subl	(%r11,%r8,4), %eax
+	jne	L(return)
+	test	%r8d, %r8d
+	je	L(return)
+	leaq	-1(%rcx), %rax
+	andq	%rax, %rcx
+	je	L(main_loop_header)
+	jmp	L(caseloop2)
+		
+		
+		
+L(caseloop3):	
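+	/* Like L(caseloop2), but with the mask in %rax; entered from
+	   the page-cross path below.  */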
+	bsfq	%rax, %rdx
+	leaq	-1(%rax), %r10
+	andq	%rax, %r10
+	movzbl	(%rdi, %rdx), %eax
+	movzbl	(%rsi, %rdx), %edx
+	movl	(%r11, %rax, 4), %eax
+	movl	(%r11, %rdx, 4), %edx
+	testl	%eax, %eax
+	je	L(zero3)
+	cmpl	%edx, %eax
+	je	L(casecnt3)
+L(zero3):	
+	subl	%edx, %eax
+	ret	
+L(casecnt3):	
+	/* Case-only mismatch: clear the bit and continue, or re-enter
+	   the main loop.  The counter in %rsi was clobbered by the
+	   page-cross path, so reset it before going back.  */
+	testq	%r10, %r10
+	movq	%r10, %rax
+	jne	L(caseloop3)
+	movq	%rdi, %rax
+	movq	%rsi, %rdx
+	movq	$63, %rsi
+	jmp	L(back_to_loop)
+		
+		
+		
+	.p2align 4
+L(loop):	
+	addq	$64, %rax
+	addq	$64, %rdx
+L(loop_start):	
+	testq	%rsi, %rsi
+	leaq	-1(%rsi), %rsi
+	je	L(loop_cross_page)
+L(back_to_loop):	
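+	/* Main loop: 64 bytes per iteration, aligned loads from the
+	   first string, unaligned from the second.  pcmpeqb followed by
+	   pminub folds "bytes differ or byte is NUL" into %xmm0.  */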
+	movdqu	(%rdx), %xmm0
+	movdqu	16(%rdx), %xmm1
+	movdqa	(%rax), %xmm2
+	movdqa	16(%rax), %xmm3
+	pcmpeqb	%xmm2, %xmm0
+	movdqu	32(%rdx), %xmm5
+	pcmpeqb	%xmm3, %xmm1
+	pminub	%xmm2, %xmm0
+	movdqu	48(%rdx), %xmm6
+	pminub	%xmm3, %xmm1
+	movdqa	32(%rax), %xmm2
+	pminub	%xmm1, %xmm0
+	movdqa	48(%rax), %xmm3
+	pcmpeqb	%xmm2, %xmm5
+	pcmpeqb	%xmm3, %xmm6
+	pminub	%xmm2, %xmm5
+	pminub	%xmm3, %xmm6
+	pminub	%xmm5, %xmm0
+	pminub	%xmm6, %xmm0
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %ecx
+	testl	%ecx, %ecx
+	je	L(loop)
+	pcmpeqb	%xmm7, %xmm5
+	movdqu	(%rdx), %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	movdqa	(%rax), %xmm2
+	pcmpeqb	%xmm2, %xmm0
+	pminub	%xmm2, %xmm0
+	pcmpeqb	%xmm7, %xmm6
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm1, %ecx
+	pmovmskb %xmm5, %r8d
+	pmovmskb %xmm0, %edi
+	salq	$16, %rcx
+	salq	$32, %r8
+	pmovmskb %xmm6, %esi
+	orq	%r8, %rcx
+	orq	%rdi, %rcx
+	salq	$48, %rsi
+	orq	%rsi, %rcx
+	movq	%rax, %rdi
+	movq	%rdx, %rsi
+	jmp	L(caseloop2)
+		
+	.p2align 4
+L(loop_cross_page):	
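+	/* The second string is within 64 bytes of a page boundary: redo
+	   the aligned 64-byte block containing %rdx against the first
+	   string and shift the mask right by the misalignment so only
+	   bytes at or after %rdx are considered.  */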
+	xor	%r10, %r10
+	movq	%rdx, %r9
+	and	$63, %r9
+	subq	%r9, %r10
+		
+	movdqa	(%rdx, %r10), %xmm0
+	movdqa	16(%rdx, %r10), %xmm1
+	movdqu	(%rax, %r10), %xmm2
+	movdqu	16(%rax, %r10), %xmm3
+	pcmpeqb	%xmm2, %xmm0
+	movdqa	32(%rdx, %r10), %xmm5
+	pcmpeqb	%xmm3, %xmm1
+	pminub	%xmm2, %xmm0
+	movdqa	48(%rdx, %r10), %xmm6
+	pminub	%xmm3, %xmm1
+	movdqu	32(%rax, %r10), %xmm2
+	movdqu	48(%rax, %r10), %xmm3
+	pcmpeqb	%xmm2, %xmm5
+	pcmpeqb	%xmm3, %xmm6
+	pminub	%xmm2, %xmm5
+	pminub	%xmm3, %xmm6
+		
+	pcmpeqb	%xmm7, %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	pcmpeqb	%xmm7, %xmm5
+	pcmpeqb	%xmm7, %xmm6
+		
+	pmovmskb %xmm1, %ecx
+	pmovmskb %xmm5, %r8d
+	pmovmskb %xmm0, %edi
+	salq	$16, %rcx
+	salq	$32, %r8
+	pmovmskb %xmm6, %esi
+	orq	%r8, %rdi
+	orq	%rcx, %rdi
+	salq	$48, %rsi
+	orq	%rsi, %rdi
+	movq	%r9, %rcx
+	movq	$63, %rsi
+	shrq	%cl, %rdi
+	test	%rdi, %rdi
+	je	L(back_to_loop)
+	movq	%rdi, %r10
+	movq	%rax, %rdi
+	movq	%rdx, %rsi
+	movq	%r10, %rax
+	jmp	L(caseloop3)
+		
+L(cross_page_start):	
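+	/* Byte-at-a-time comparison, used for up to 64 bytes when
+	   either string starts too close to a page end.  */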
+	xor	%edx, %edx
+	jmp	L(cross_page)
+	.p2align 4
+L(cross_page_loop):	
+	addq	$1, %rdx
+	cmpq	$64, %rdx
+	je	L(main_loop_header)
+L(cross_page):	
+	movzbl	(%rdi, %rdx), %eax
+	movzbl	(%rsi, %rdx), %ecx
+	cmp	%eax, %ecx
+	je	L(skip_table)
+	movl	(%r11, %rax, 4), %eax
+	subl	(%r11, %rcx, 4), %eax
+	jne	L(ret2)
+L(skip_table):	
+	/* %ecx still holds the raw byte of the second string; it is
+	   zero here only when both strings have ended.  */
+	test	%ecx, %ecx
+	jne	L(cross_page_loop)
+L(ret2):	
+	ret	
+		
+		
+END (__strcasecmp_sse2_unaligned_l)
+#endif	
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l.S b/sysdeps/x86_64/multiarch/strcasecmp_l.S
index 49f5b9f..6cad73f 100644
--- a/sysdeps/x86_64/multiarch/strcasecmp_l.S
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l.S
@@ -1,8 +1,106 @@ 
-/* Multiple versions of strcasecmp and strcasecmp_l
-   All versions must be listed in ifunc-impl-list.c.  */
+/* Multiple versions of strcasecmp
+   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
 #define STRCMP __strcasecmp_l
 #define USE_AS_STRCASECMP_L
-#include "strcmp.S"
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+# include "locale-defines.h"
+
+
+# define STRCMP_SSSE3	__strcasecmp_l_ssse3
+# define STRCMP_SSE2	__strcasecmp_l_sse2
+# define __GI_STRCMP	__GI___strcasecmp_l
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(STRCMP)
+	.type	STRCMP, @gnu_indirect_function
+	/* Manually inlined call to __get_cpu_features.  */
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:
+	leaq	__strcasecmp_sse2_unaligned_l(%rip), %rax
+	testl   $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+	jnz     3f
+2:	leaq	STRCMP_SSSE3(%rip), %rax
+	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	jnz	3f
+	leaq	STRCMP_SSE2(%rip), %rax
+3:	ret
+END(STRCMP)
+
+ENTRY(__strcasecmp)
+	.type	__strcasecmp, @gnu_indirect_function
+	/* Manually inlined call to __get_cpu_features.  */
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:
+	leaq	__strcasecmp_sse2_unaligned(%rip), %rax
+	testl   $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+	jnz     3f
+2:	leaq	__strcasecmp_ssse3(%rip), %rax
+	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	jnz	3f
+	leaq	__strcasecmp_sse2(%rip), %rax
+3:	ret
+
+END(__strcasecmp)
+weak_alias (__strcasecmp, strcasecmp)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type STRCMP_SSE2, @function; \
+	.align 16; \
+	.globl STRCMP_SSE2; \
+	.hidden STRCMP_SSE2; \
+	STRCMP_SSE2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size STRCMP_SSE2, .-STRCMP_SSE2
+
+#  define ENTRY2(name) \
+	.type __strcasecmp_sse2, @function; \
+	.align 16; \
+	.globl __strcasecmp_sse2; \
+	.hidden __strcasecmp_sse2; \
+	__strcasecmp_sse2: cfi_startproc; \
+	CALL_MCOUNT
+#  define END2(name) \
+	cfi_endproc; .size __strcasecmp_sse2, .-__strcasecmp_sse2
+
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcasecmp calls through
+   a PLT.  The speedup we get from the optimized implementations is
+   likely eaten away by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2
+#endif
+
+#include "../strcmp.S"
 
 weak_alias (__strcasecmp_l, strcasecmp_l)
 libc_hidden_def (strcasecmp_l)