From patchwork Fri Jun 19 15:53:04 2015 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Patchwork-Submitter: Ondrej Bilka X-Patchwork-Id: 7258 Received: (qmail 116981 invoked by alias); 19 Jun 2015 15:53:32 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 116970 invoked by uid 89); 19 Jun 2015 15:53:31 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=2.1 required=5.0 tests=AWL, BAYES_99, BAYES_999, FREEMAIL_FROM, SPF_NEUTRAL autolearn=no version=3.3.2 X-HELO: popelka.ms.mff.cuni.cz Date: Fri, 19 Jun 2015 17:53:04 +0200 From: =?utf-8?B?T25kxZllaiBCw61sa2E=?= To: libc-alpha@sourceware.org Subject: Re: [PATCH v2 neleai/string-x64] Improve memcmp performance and fix regression. Message-ID: <20150619155304.GA26278@domone> References: <20150618080910.GA27306@domone> MIME-Version: 1.0 Content-Disposition: inline In-Reply-To: <20150618080910.GA27306@domone> User-Agent: Mutt/1.5.20 (2009-06-14) On Thu, Jun 18, 2015 at 10:09:10AM +0200, Ondřej Bílka wrote: > Hi, > > As I sumbitted before in 2013 memcmp improvement here is new version > that improves performance a bit more. > > Also when I browsed results I found that memcmp-sse4 is in fact > regression for i7 nehalem, ivy bridge and haswell architectures. There > its beaten by old sse2 code by more than 10%. > > Main idea of new implementation is same, problem with performance is > that lot inputs were identical with small n. > For that I found that following approach gives best performance when > n<64 is likely. 
> > if (!cross_page (s1) && !cross_page (s2)) > { > mask = get_mask(EQ(EQ(LOAD(s1),LOAD(s2)),zero)) > mask2 = mask & (2 << (n-1)); > if (mask2) > return s1[first_byte(mask2)]-s2[first_byte(mask2)]; > if (n<=16) > return 0; > mask |= get_mask(EQ(EQ(LOAD(s1+16),LOAD(s2+16)),zero)) << 16; > mask |= get_mask(EQ(EQ(LOAD(s1+32),LOAD(s2+32)),zero)) << 32; > mask |= get_mask(EQ(EQ(LOAD(s1+48),LOAD(s2+48)),zero)) << 48; > mask2 = mask & (2 << (n-1)); > if (mask2) > return s1[first_byte(mask2)]-s2[first_byte(mask2)]; > if (n<=64) > return 0; > if (mask) > return s1[first_byte(mask)]-s2[first_byte(mask)]; > } > > I didn't check yet using just registers and byteswap to eliminate the need > of getting the exact byte position, as I wrote in the related thread. > > I could improve this a bit more; I lose a lot of cycles in loop ending > conditions. The problem is that I need to handle that an unaligned s2 may read > from the next page; I would need to add more complicated logic to compute the > number of loop iterations. > > That's related to avx2. I included it as an RFC but it harms performance on > haswell. > > Last is wmemcmp that I would also need to convert; for now I just moved > memcmp-sse4 there. > > A profile is found here. > > http://kam.mff.cuni.cz/~ondra/benchmark_string/memcmp_profile.html > I updated that new version. I removed avx2 for now; I will submit it when I find how it could improve performance. The second change is that I added wmemcmp conditionals, so now I could delete memcmp-sse4 and wmemcmp-sse4. * sysdeps/x86_64/memcmp.S: New implementation. * sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Remove memcmp-sse4. * sysdeps/x86_64/multiarch/Makefile(routines): Remove memcmp-sse4. * sysdeps/x86_64/multiarch/memcmp.S: Likewise. * sysdeps/x86_64/multiarch/memcmp-sse4.S: Removed. * sysdeps/x86_64/multiarch/wmemcmp-sse4.S: Likewise.
--- sysdeps/x86_64/memcmp.S | 495 ++++++++--------------- sysdeps/x86_64/multiarch/Makefile | 6 +- sysdeps/x86_64/multiarch/ifunc-impl-list.c | 9 +- sysdeps/x86_64/multiarch/memcmp-avx2.S | 3 + sysdeps/x86_64/multiarch/memcmp.S | 25 +- sysdeps/x86_64/multiarch/wmemcmp-sse4.S | 4 - sysdeps/x86_64/multiarch/wmemcmp.S | 12 +- 8 files changed, 203 insertions(+), 360 deletions(-) create mode 100644 sysdeps/x86_64/multiarch/memcmp-avx2.S delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-sse4.S diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S index f636716..55377fe 100644 --- a/sysdeps/x86_64/memcmp.S +++ b/sysdeps/x86_64/memcmp.S @@ -19,340 +19,185 @@ #include +#ifndef MEMCMP +# define MEMCMP memcmp +#endif + .text -ENTRY (memcmp) - test %rdx, %rdx - jz L(finz) - cmpq $1, %rdx - jle L(finr1b) - subq %rdi, %rsi - movq %rdx, %r10 - cmpq $32, %r10 - jge L(gt32) - /* Handle small chunks and last block of less than 32 bytes. */ -L(small): - testq $1, %r10 - jz L(s2b) - movzbl (%rdi), %eax - movzbl (%rdi, %rsi), %edx - subq $1, %r10 - je L(finz1) - addq $1, %rdi - subl %edx, %eax - jnz L(exit) -L(s2b): - testq $2, %r10 - jz L(s4b) - movzwl (%rdi), %eax - movzwl (%rdi, %rsi), %edx - subq $2, %r10 - je L(fin2_7) - addq $2, %rdi - cmpl %edx, %eax - jnz L(fin2_7) -L(s4b): - testq $4, %r10 - jz L(s8b) - movl (%rdi), %eax - movl (%rdi, %rsi), %edx - subq $4, %r10 - je L(fin2_7) - addq $4, %rdi - cmpl %edx, %eax - jnz L(fin2_7) -L(s8b): - testq $8, %r10 - jz L(s16b) - movq (%rdi), %rax - movq (%rdi, %rsi), %rdx - subq $8, %r10 - je L(fin2_7) - addq $8, %rdi - cmpq %rdx, %rax - jnz L(fin2_7) -L(s16b): - movdqu (%rdi), %xmm1 - movdqu (%rdi, %rsi), %xmm0 - pcmpeqb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - xorl %eax, %eax - subl $0xffff, %edx - jz L(finz) - bsfl %edx, %ecx - leaq (%rdi, %rcx), %rcx - movzbl (%rcx), %eax - movzbl (%rsi, %rcx), %edx - jmp L(finz1) +ENTRY (MEMCMP) + testq %rdx, %rdx + je L(return_zero) +#ifdef AS_WMEMCMP + shl $2, %rdx +#endif + pxor %xmm4, 
%xmm4 + movl %edi, %eax + andl $4095, %eax + cmpl $4032, %eax + jg L(cross_page) +L(handle_end): + movl %esi, %eax + andl $4095, %eax + cmpl $4032, %eax + jg L(cross_page) + movdqu (%rdi), %xmm0 + lea -1(%edx), %ecx + movl $2, %eax + movdqu (%rsi), %xmm1 + salq %cl, %rax + leaq -1(%rax), %rcx + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + and %ecx, %eax + jne L(different) + cmpq $16, %rdx + ja L(next) + ret +L(next): + pmovmskb %xmm0, %r8d + movdqu 16(%rdi), %xmm2 + movdqu 16(%rsi), %xmm6 + movdqu 32(%rdi), %xmm1 + pcmpeqb %xmm6, %xmm2 + movdqu 32(%rsi), %xmm5 + pcmpeqb %xmm4, %xmm2 + pcmpeqb %xmm5, %xmm1 + movdqu 48(%rdi), %xmm7 + pmovmskb %xmm2, %eax + movdqu 48(%rsi), %xmm3 + pcmpeqb %xmm4, %xmm1 + pmovmskb %xmm1, %r9d + sal $16, %eax + pcmpeqb %xmm3, %xmm7 + salq $32, %r9 + pcmpeqb %xmm4, %xmm7 + orq %r9, %rax + orq %r8, %rax + pmovmskb %xmm7, %r8d + salq $48, %r8 + orq %r8, %rax + movq %rax, %r8 + andq %rcx, %rax + jne L(different) + cmpq $64, %rdx + jbe L(return_zero) + movq %r8, %rax + testq %rax, %rax + jne L(different) +L(align_loop): + leaq 64(%rdi), %rax + andq $-64, %rax + subq %rdi, %rax + subq %rax, %rdx + addq %rax, %rdi + addq %rax, %rsi + cmpq $64, %rdx + ja L(loop_start) + testq %rdx, %rdx + jne L(handle_end) + xorl %eax, %eax + ret - .p2align 4,, 4 -L(finr1b): - movzbl (%rdi), %eax - movzbl (%rsi), %edx -L(finz1): + .p2align 4 +L(different): + bsfq %rax, %rdx +#ifdef AS_WMEMCMP + and $-4, %rdx + mov (%rdi,%rdx), %eax + mov (%rsi,%rdx), %edx subl %edx, %eax -L(exit): + jg L(ret1) + jl L(ret_neg_1) ret - - .p2align 4,, 4 -L(fin2_7): - cmpq %rdx, %rax - jz L(finz) - movq %rax, %r11 - subq %rdx, %r11 - bsfq %r11, %rcx - sarq $3, %rcx - salq $3, %rcx - sarq %cl, %rax - movzbl %al, %eax - sarq %cl, %rdx - movzbl %dl, %edx +L(ret1): + mov $1, %eax + ret +L(ret_neg_1): + mov $-1, %eax + ret +#else + movzbl (%rdi,%rdx), %eax + movzbl (%rsi,%rdx), %edx subl %edx, %eax ret - - .p2align 4,, 4 -L(finz): +#endif + + .p2align 4 +L(loop): 
+ subq $64, %rdx + addq $64, %rdi + addq $64, %rsi + cmpq $64, %rdx + jbe L(less_64_bytes) +L(loop_start): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm1 + pcmpeqb (%rdi), %xmm0 + movdqu 32(%rsi), %xmm2 + pcmpeqb 16(%rdi), %xmm1 + movdqu 48(%rsi), %xmm3 + pcmpeqb 32(%rdi), %xmm2 + pcmpeqb 48(%rdi), %xmm3 + pminub %xmm0, %xmm3 + pminub %xmm1, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm4, %xmm3 + pmovmskb %xmm3, %eax + testl %eax, %eax + je L(loop) + shl $48, %rax + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm4, %xmm1 + pcmpeqb %xmm4, %xmm2 + pmovmskb %xmm0, %r8 + pmovmskb %xmm1, %rcx + pmovmskb %xmm2, %r9 + shl $16, %ecx + shl $32, %r9 + or %r8, %rax + or %r9, %rax + or %rcx, %rax + jmp L(different) + + .p2align 4 +L(less_64_bytes): + testq %rdx, %rdx + jne L(handle_end) xorl %eax, %eax ret - /* For blocks bigger than 32 bytes - 1. Advance one of the addr pointer to be 16B aligned. - 2. Treat the case of both addr pointers aligned to 16B - separately to avoid movdqu. - 3. Handle any blocks of greater than 64 consecutive bytes with - unrolling to reduce branches. - 4. At least one addr pointer is 16B aligned, use memory version - of pcmbeqb. - */ - .p2align 4,, 4 -L(gt32): - movq %rdx, %r11 - addq %rdi, %r11 - movq %rdi, %r8 - - andq $15, %r8 - jz L(16am) - /* Both pointers may be misaligned. */ - movdqu (%rdi), %xmm1 - movdqu (%rdi, %rsi), %xmm0 - pcmpeqb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - subl $0xffff, %edx - jnz L(neq) - neg %r8 - leaq 16(%rdi, %r8), %rdi -L(16am): - /* Handle two 16B aligned pointers separately. */ - testq $15, %rsi - jz L(ATR) - testq $16, %rdi - jz L(A32) - movdqu (%rdi, %rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi -L(A32): - movq %r11, %r10 - andq $-32, %r10 - cmpq %r10, %rdi - jge L(mt16) - /* Pre-unroll to be ready for unrolled 64B loop. 
*/ - testq $32, %rdi - jz L(A64) - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - -L(A64): - movq %r11, %r10 - andq $-64, %r10 - cmpq %r10, %rdi - jge L(mt32) - -L(A64main): - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - cmpq %rdi, %r10 - jne L(A64main) - -L(mt32): - movq %r11, %r10 - andq $-32, %r10 - cmpq %r10, %rdi - jge L(mt16) - -L(A32main): - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - cmpq %rdi, %r10 - jne L(A32main) -L(mt16): - subq %rdi, %r11 - je L(finz) - movq %r11, %r10 - jmp L(small) - - .p2align 4,, 4 -L(neq): - bsfl %edx, %ecx - movzbl (%rdi, %rcx), %eax - addq %rdi, %rsi - movzbl (%rsi,%rcx), %edx - jmp L(finz1) - - .p2align 4,, 4 -L(ATR): - movq %r11, %r10 - andq $-32, %r10 - cmpq %r10, %rdi - jge L(mt16) - testq $16, %rdi - jz L(ATR32) - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - cmpq %rdi, %r10 - je L(mt16) - -L(ATR32): - movq %r11, %r10 - andq $-64, %r10 - testq $32, %rdi - jz L(ATR64) - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz 
L(neq) - addq $16, %rdi - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - -L(ATR64): - cmpq %rdi, %r10 - je L(mt32) - -L(ATR64main): - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - cmpq %rdi, %r10 - jne L(ATR64main) - - movq %r11, %r10 - andq $-32, %r10 - cmpq %r10, %rdi - jge L(mt16) - -L(ATR32res): - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - cmpq %r10, %rdi - jne L(ATR32res) - - subq %rdi, %r11 - je L(finz) - movq %r11, %r10 - jmp L(small) - /* Align to 16byte to improve instruction fetch. 
*/ - .p2align 4,, 4 -END(memcmp) + .p2align 4 +L(cross_page): + testq %rdx, %rdx + je L(return_zero) + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + cmpb %cl, %al + jne L(cross_page_different) + movl $1, %r8d + jmp L(cross_page_loop_start) + + .p2align 4 +L(cross_page_loop): + movzbl (%rdi,%r8), %eax + movzbl (%rsi,%r8), %ecx + cmpb %cl, %al + jne L(cross_page_different) + addq $1, %r8 + cmpq $65, %r8 + je L(align_loop) +L(cross_page_loop_start): + cmpq %rdx, %r8 + jne L(cross_page_loop) +L(return_zero): + xorl %eax, %eax + ret +L(cross_page_different): + subl %ecx, %eax + ret +END(MEMCMP) -#undef bcmp +#undef bcmp weak_alias (memcmp, bcmp) libc_hidden_builtin_def (memcmp) diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index c573744..679db2a 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -8,7 +8,7 @@ ifeq ($(subdir),string) sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ strcmp-sse2-unaligned strncmp-ssse3 \ - memcmp-sse4 memcpy-ssse3 \ + memcpy-ssse3 \ memcpy-sse2-unaligned mempcpy-ssse3 \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \ @@ -29,10 +29,10 @@ CFLAGS-strspn-c.c += -msse4 endif ifeq (yes,$(config-cflags-avx2)) -sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 +sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2 endif endif ifeq ($(subdir),wcsmbs) -sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c +sysdep_routines += wmemcmp-sse2-unaligned wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c endif diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index d398e43..b3dbe65 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -39,10 +39,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support 
sysdeps/x86_64/multiarch/memcmp.S. */ IFUNC_IMPL (i, name, memcmp, - IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSE4_1, - __memcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, memcmp, HAS_AVX2, __memcmp_avx2) IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSSE3, __memcmp_ssse3) - IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) + IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2_unaligned)) /* Support sysdeps/x86_64/multiarch/memmove_chk.S. */ IFUNC_IMPL (i, name, __memmove_chk, @@ -211,8 +210,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/wmemcmp.S. */ IFUNC_IMPL (i, name, wmemcmp, - IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSE4_1, - __wmemcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, wmemcmp, 1, + __wmemcmp_sse2_unaligned) IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSSE3, __wmemcmp_ssse3) IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2.S b/sysdeps/x86_64/multiarch/memcmp-avx2.S new file mode 100644 index 0000000..60483bf --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp-avx2.S @@ -0,0 +1,3 @@ +#define USE_AVX2 +#define MEMCMP __memcmp_avx2 +#include "../memcmp.S" diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S index f8b4636..5d87a17 100644 --- a/sysdeps/x86_64/multiarch/memcmp.S +++ b/sysdeps/x86_64/multiarch/memcmp.S @@ -29,33 +29,28 @@ ENTRY(memcmp) cmpl $0, KIND_OFFSET+__cpu_features(%rip) jne 1f call __init_cpu_features - -1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip) jnz 2f - leaq __memcmp_sse2(%rip), %rax - ret - -2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) - jz 3f - leaq __memcmp_sse4_1(%rip), %rax +1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + jnz 3f +2: leaq __memcmp_sse2_unaligned(%rip), %rax ret 3: leaq __memcmp_ssse3(%rip), %rax ret - END(memcmp) # undef ENTRY 
# define ENTRY(name) \ - .type __memcmp_sse2, @function; \ + .type __memcmp_sse2_unaligned, @function; \ .p2align 4; \ - .globl __memcmp_sse2; \ - .hidden __memcmp_sse2; \ - __memcmp_sse2: cfi_startproc; \ + .globl __memcmp_sse2_unaligned; \ + .hidden __memcmp_sse2_unaligned; \ + __memcmp_sse2_unaligned: cfi_startproc; \ CALL_MCOUNT # undef END # define END(name) \ - cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2 + cfi_endproc; .size __memcmp_sse2_unaligned, .-__memcmp_sse2_unaligned # ifdef SHARED # undef libc_hidden_builtin_def @@ -63,7 +58,7 @@ END(memcmp) they will be called without setting up EBX needed for PLT which is used by IFUNC. */ # define libc_hidden_builtin_def(name) \ - .globl __GI_memcmp; __GI_memcmp = __memcmp_sse2 + .globl __GI_memcmp; __GI_memcmp = __memcmp_sse2_unaligned # endif #endif diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S deleted file mode 100644 index b07973a..0000000 --- a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_WMEMCMP 1 -#define MEMCMP __wmemcmp_sse4_1 - -#include "memcmp-sse4.S" diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S index 109e245..dabd3ed 100644 --- a/sysdeps/x86_64/multiarch/wmemcmp.S +++ b/sysdeps/x86_64/multiarch/wmemcmp.S @@ -30,18 +30,16 @@ ENTRY(wmemcmp) jne 1f call __init_cpu_features -1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip) jnz 2f - leaq __wmemcmp_sse2(%rip), %rax - ret - -2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) - jz 3f - leaq __wmemcmp_sse4_1(%rip), %rax +1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + jnz 3f +2: leaq __wmemcmp_sse2_unaligned(%rip), %rax ret 3: leaq __wmemcmp_ssse3(%rip), %rax ret + END(wmemcmp) #endif