From patchwork Sat Jun 20 17:48:04 2015
X-Patchwork-Id: 7271
Date: Sat, 20 Jun 2015 19:48:04 +0200
From: Ondřej Bílka
To: libc-alpha@sourceware.org
Subject: [PATCH neleai/string-x64] Optimize strncmp with unaligned loads.
Message-ID: <20150620174804.GA20938@domone>
In-Reply-To: <20150620103548.GA21670@domone>

Hi,

This uses the refactored strcmp to add strncmp functionality.

The basic idea is simple: in the unaligned header, when n < 64, initialize
a register with 1 << (n - 1) and OR it into the comparison masks, so the
length bound is detected the same way as a terminating NUL. From what I
tried, that gives the smallest slowdown versus strcmp.

For larger sizes, I modify the loop counter that triggers when we need to
do an unaligned load across a page so that it also triggers when fewer
than 64 characters remain to inspect.

With these tricks, performance is nearly identical to strcmp.

OK to add this?

Profile graphs are here:

http://kam.mff.cuni.cz/~ondra/benchmark_string/strncmp_profile.html

	* sysdeps/x86_64/multiarch/Makefile (routines): Add strncmp-avx2.S
	and strncmp-sse2-unaligned.S.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Add
	__strncmp_sse2_unaligned and __strncmp_avx2.
	* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Add strncmp
	functionality.
	* sysdeps/x86_64/multiarch/strcmp.S: Adjust ifunc.
	* sysdeps/x86_64/multiarch/strncmp-avx2.S: New file.
	* sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S: Likewise.
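For readers who do not want to trace the assembly, here is a rough C
sketch of the mask trick for the first 16-byte step. It is an
illustration only, not part of the patch: the function name
strncmp_head16 is made up, page-crossing checks are omitted, and the real
code keeps a 64-bit mask covering the whole 64-byte header.

#include <stdint.h>
#include <emmintrin.h>

/* Sketch of the mask trick for the first 16 bytes.  A set bit i in
   "mask" means byte i differs or is NUL; OR-ing in 1 << (n - 1) makes
   the length bound look like a terminator found at offset n - 1, so a
   single count-trailing-zeros finds whichever limit comes first.  */
static int
strncmp_head16 (const char *s1, const char *s2, uint64_t n)
{
  if (n == 0)
    return 0;
  /* Only bits 0..15 matter for a 16-byte chunk; the patch keeps a
     64-bit mask for the full header (the bts %r10, %rdx above).  */
  uint64_t limit = n <= 16 ? 1ull << (n - 1) : 0;

  __m128i a = _mm_loadu_si128 ((const __m128i *) s1);
  __m128i b = _mm_loadu_si128 ((const __m128i *) s2);
  /* As in the pcmpeqb/pminub/pcmpeqb sequence of the patch:
     min(a, a == b) is zero exactly where the bytes differ or where
     equal bytes are NUL.  */
  __m128i eq = _mm_cmpeq_epi8 (a, b);
  __m128i end = _mm_cmpeq_epi8 (_mm_min_epu8 (a, eq),
				_mm_setzero_si128 ());
  uint64_t mask = (uint16_t) _mm_movemask_epi8 (end) | limit;

  if (mask == 0)
    return 0;	/* The real code continues with the next 48 bytes.  */
  int i = __builtin_ctzll (mask);
  return (unsigned char) s1[i] - (unsigned char) s2[i];
}

The large-size trick can be sketched the same way; the helper below is
likewise hypothetical and only models the clamp the patch performs on the
main-loop counter.

/* The main loop runs over 64-byte blocks and the counter says how many
   blocks fit before the next page crossing; for strncmp we additionally
   clamp it by the remaining length, so one exit path handles both the
   page bound and the length bound.  */
static size_t
blocks_until_stop (size_t bytes_to_page_end, size_t remaining)
{
  size_t page_blocks = bytes_to_page_end >> 6;	/* shr $6, %esi */
  size_t len_blocks = remaining >> 6;		/* shr $6, %r9  */
  return len_blocks < page_blocks ? len_blocks : page_blocks;
}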
---
 sysdeps/x86_64/multiarch/Makefile                 |  4 +-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c        |  5 +-
 sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S  | 78 +++++++++++++++++++++--
 sysdeps/x86_64/multiarch/strcmp.S                 | 37 ++++++-----
 sysdeps/x86_64/multiarch/strncmp-avx2.S           |  4 ++
 sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S |  3 +
 6 files changed, 104 insertions(+), 27 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strncmp-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index bf48283..95e0190 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -7,7 +7,7 @@ endif
 ifeq ($(subdir),string)
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
-		   strcmp-sse2-unaligned strncmp-ssse3 \
+		   strcmp-sse2-unaligned strncmp-sse2-unaligned strncmp-ssse3 \
 		   memcpy-ssse3 \
 		   memcpy-sse2-unaligned mempcpy-ssse3 \
 		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
@@ -30,7 +30,7 @@ CFLAGS-strspn-c.c += -msse4
 endif
 
 ifeq (yes,$(config-cflags-avx2))
-sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2 strcmp-avx2
+sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2 strcmp-avx2 strncmp-avx2
 endif
 endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 57ce237..51ff3ed 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -257,8 +257,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncmp.S.  */
   IFUNC_IMPL (i, name, strncmp,
-	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSE4_2,
-			      __strncmp_sse42)
+	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_AVX2, __strncmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSSE3, __strncmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
 
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index 8258eb8..f3a0508 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -19,6 +19,14 @@
 #include "sysdep.h"
 
 ENTRY ( __strcmp_sse2_unaligned)
+#ifdef AS_STRNCMP
+	lea	-1(%rdx), %r10
+	test	%rdx, %rdx
+	je	L(ret_zero)
+L(back_to_start):
+	xor	%rdx, %rdx
+#endif
+
 	pxor	%xmm7, %xmm7
 	movl	%esi, %eax
 	andl	$4095, %eax
@@ -29,20 +37,35 @@ ENTRY ( __strcmp_sse2_unaligned)
 	andl	$4095, %eax
 	cmpl	$4032, %eax
 	jg	L(cross_page)
+#ifdef AS_STRNCMP
+	cmp	$64, %r10
+	jae	L(dont_set_mask)
+	bts	%r10, %rdx
+L(dont_set_mask):
+#endif
+
 	movdqu	(%rdi), %xmm1
 	movdqu	(%rsi), %xmm0
 	pcmpeqb	%xmm1, %xmm0
 	pminub	%xmm1, %xmm0
 	pcmpeqb	%xmm7, %xmm0
 	pmovmskb	%xmm0, %eax
+#ifdef AS_STRNCMP
+	or	%dx, %ax
+#else
 	test	%eax, %eax
+#endif
 	je	L(next_48_bytes)
 	bsf	%eax, %edx
 	movzbl	(%rdi, %rdx), %eax
 	movzbl	(%rsi, %rdx), %edx
 	subl	%edx, %eax
 	ret
-
+#ifdef AS_STRNCMP
+L(ret_zero):
+	xor	%eax, %eax
+	ret
+#endif
 	.p2align 4
 L(next_48_bytes):
 	movdqu	16(%rdi), %xmm6
@@ -54,16 +77,19 @@ L(next_48_bytes):
 	pcmpeqb	%xmm7, %xmm3
 	movdqu	48(%rdi), %xmm4
 	pcmpeqb	%xmm5, %xmm2
-	pmovmskb	%xmm3, %edx
 	movdqu	48(%rsi), %xmm0
 	pminub	%xmm5, %xmm2
 	pcmpeqb	%xmm7, %xmm2
 	pcmpeqb	%xmm4, %xmm0
 	pmovmskb	%xmm2, %eax
+	salq	$32, %rax
+#ifdef AS_STRNCMP
+	or	%rdx, %rax
+#endif
+	pmovmskb	%xmm3, %edx
 	sal	$16, %edx
 	pminub	%xmm4, %xmm0
 	pcmpeqb	%xmm7, %xmm0
-	salq	$32, %rax
 	orq	%rdx, %rax
 	pmovmskb	%xmm0, %ecx
 	salq	$48, %rcx
@@ -82,6 +108,10 @@ L(main_loop_header):
 #endif
 	leaq	64(%rdi), %rdx
 	andq	$-64, %rdx
+# ifdef AS_STRNCMP
+	addq	%rdi, %r10
+	subq	%rdx, %r10
+# endif
 	subq	%rdi, %rdx
 	leaq	(%rdi, %rdx), %rax
 	addq	%rsi, %rdx
@@ -90,6 +120,15 @@ L(main_loop_header):
 	andl	$4095, %ecx
 	sub	%ecx, %esi
 	shr	$6, %esi
+#ifdef AS_STRNCMP
+	mov	%r10, %r9
+	addq	%rdx, %r10
+	shr	$6, %r9
+	cmp	%r9, %rsi
+	jb	L(dont_set_page_bound)
+	mov	%r9, %rsi
+L(dont_set_page_bound):
+#endif
 
 	.p2align 4
 L(loop):
@@ -111,7 +150,7 @@ L(back_to_loop):
 	addq	$64, %rdx
 	vpmovmskb	%ymm2, %edi
 	test	%edi, %edi
-	je	.Lloop
+	je	L(loop)
 	shl	$32, %rdi
 	vpcmpeqb	%ymm7, %ymm0, %ymm0
 	vpmovmskb	%ymm0, %ecx
@@ -164,6 +203,14 @@ L(back_to_loop):
 
 	.p2align 4
 L(loop_cross_page):
+#ifdef AS_STRNCMP
+	mov	%r10, %r9
+	sub	%rdx, %r9
+	cmp	$64, %r9
+	jb	L(prepare_back_to_start)
+#endif
+
+
 	mov	%edx, %ecx
 	and	$63, %ecx
 	neg	%rcx
@@ -219,6 +266,14 @@ L(loop_cross_page):
 #endif
 	mov	%edx, %ecx
 	mov	$63, %esi
+#ifdef AS_STRNCMP
+	shr	$6, %r9
+	sub	$1, %r9
+	cmp	%r9, %rsi
+	jb	L(dont_set_bound2)
+	mov	%r9, %rsi
+L(dont_set_bound2):
+#endif
 	shrq	%cl, %rdi
 	test	%rdi, %rdi
 	je	L(back_to_loop)
@@ -231,6 +286,18 @@ L(loop_cross_page):
 	subl	%edx, %eax
 	ret
 
+#ifdef AS_STRNCMP
+L(prepare_back_to_start):
+# ifdef USE_AVX2
+	vzeroupper
+# endif
+	mov	%r9, %r10
+	mov	%rdx, %rsi
+	mov	%rax, %rdi
+	jmp	L(back_to_start)
+#endif
+
+
 L(cross_page):
 	xorl	%edx, %edx
 	jmp	L(cross_page_loop_start)
@@ -244,6 +311,11 @@ L(cross_page_loop_start):
 	movzbl	(%rsi, %rdx), %ecx
 	subl	%ecx, %eax
 	jne	L(different)
+#ifdef AS_STRNCMP
+	cmp	%rdx, %r10
+	je	L(different)
+#endif
+
 	test	%ecx, %ecx
 	jne	L(cross_page_loop)
 L(different):
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index 867e9d4..02d22d1 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -31,8 +31,8 @@
 	test	%r9, %r9;				\
 	je	LABEL(strcmp_exitz);			\
 	mov	%r9, %r11
-
-# define STRCMP_SSE42	__strncmp_sse42
+# define STRCMP_AVX2	__strncmp_avx2
+# define STRCMP_SSE2_UNALIGNED	__strncmp_sse2_unaligned
 # define STRCMP_SSSE3	__strncmp_ssse3
 # define STRCMP_SSE2	__strncmp_sse2
 # define __GI_STRCMP	__GI_strncmp
@@ -69,8 +69,9 @@
 # define USE_AS_STRCMP
 # define UPDATE_STRNCMP_COUNTER
 # ifndef STRCMP
+#  define STRCMP_AVX2	__strcmp_avx2
+#  define STRCMP_SSE2_UNALIGNED	__strcmp_sse2_unaligned
 #  define STRCMP	strcmp
-#  define STRCMP_SSE42	__strcmp_sse42
 #  define STRCMP_SSSE3	__strcmp_ssse3
 #  define STRCMP_SSE2	__strcmp_sse2
 #  define __GI_STRCMP	__GI_strcmp
@@ -89,23 +90,23 @@ ENTRY(STRCMP)
 	jne	1f
 	call	__init_cpu_features
 1:
-#ifdef USE_AS_STRCMP
-# ifdef HAVE_AVX2_SUPPORT
+# if defined (USE_AS_STRCMP) || defined (USE_AS_STRNCMP)
+# ifdef HAVE_AVX2_SUPPORT
-	leaq	__strcmp_avx2(%rip), %rax
+	leaq	STRCMP_AVX2(%rip), %rax
 	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jnz	3f
-# endif
-	leaq	__strcmp_sse2_unaligned(%rip), %rax
+# endif
+	leaq	STRCMP_SSE2_UNALIGNED(%rip), %rax
 	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
 	jnz	3f
-#else
+# else
 	testl	$bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip)
 	jnz	2f
 	leaq	STRCMP_SSE42(%rip), %rax
 	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
 	jnz	3f
-#endif
+# endif
 2:	leaq	STRCMP_SSSE3(%rip), %rax
 	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
 	jnz	3f
@@ -166,15 +167,13 @@ END(__strncasecmp)
 weak_alias (__strncasecmp, strncasecmp)
 # endif
 
-# undef LABEL
-# define LABEL(l) .L##l##_sse42
-# define GLABEL(l) l##_sse42
-# define SECTION sse4.2
-# include "strcmp-sse42.S"
-
-
-# ifdef HAVE_AVX_SUPPORT
-#  if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+#  undef LABEL
+#  define LABEL(l) .L##l##_sse42
+#  define GLABEL(l) l##_sse42
+#  define SECTION sse4.2
+#  include "strcmp-sse42.S"
+#  ifdef HAVE_AVX_SUPPORT
 #   define LABEL(l) .L##l##_avx
 #   define GLABEL(l) l##_avx
 #   define USE_AVX 1
diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S
new file mode 100644
index 0000000..fe70abd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S
@@ -0,0 +1,4 @@
+#define USE_AVX2
+#define AS_STRNCMP
+#define __strcmp_sse2_unaligned __strncmp_avx2
+#include "strcmp-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S
new file mode 100644
index 0000000..d987b28
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S
@@ -0,0 +1,3 @@
+#define AS_STRNCMP
+#define __strcmp_sse2_unaligned __strncmp_sse2_unaligned
+#include "strcmp-sse2-unaligned.S"