[v1,1/6] x86: Remove {w}memcmp-ssse3
Checks
Context |
Check |
Description |
dj/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
Commit Message
With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
prefer SSSE3. As a result it's no longer worth the code size cost.
---
sysdeps/x86_64/multiarch/Makefile | 2 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 -
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 -
sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1992 --------------------
sysdeps/x86_64/multiarch/wmemcmp-ssse3.S | 4 -
5 files changed, 2006 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
Comments
On Fri, Mar 25, 2022 at 11:36 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
> prefer SSSE3. As a result it's no longer worth the code size cost.
> ---
> sysdeps/x86_64/multiarch/Makefile | 2 -
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 -
> sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 -
> sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1992 --------------------
> sysdeps/x86_64/multiarch/wmemcmp-ssse3.S | 4 -
> 5 files changed, 2006 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 6507d1b7fa..51222dfab1 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -12,7 +12,6 @@ sysdep_routines += \
> memcmp-evex-movbe \
> memcmp-sse2 \
> memcmp-sse4 \
> - memcmp-ssse3 \
> memcmpeq-avx2 \
> memcmpeq-avx2-rtm \
> memcmpeq-evex \
> @@ -179,7 +178,6 @@ sysdep_routines += \
> wmemcmp-c \
> wmemcmp-evex-movbe \
> wmemcmp-sse4 \
> - wmemcmp-ssse3 \
> # sysdep_routines
> endif
>
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 40cc6cc49e..f389928a4e 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> __memcmp_evex_movbe)
> IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
> __memcmp_sse4_1)
> - IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
> - __memcmp_ssse3)
> IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
>
> #ifdef SHARED
> @@ -844,8 +842,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> __wmemcmp_evex_movbe)
> IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
> __wmemcmp_sse4_1)
> - IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
> - __wmemcmp_ssse3)
> IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
>
> /* Support sysdeps/x86_64/multiarch/wmemset.c. */
> diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
> index cd12613699..44759a3ad5 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
> @@ -20,7 +20,6 @@
> # include <init-arch.h>
>
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
> @@ -50,8 +49,5 @@ IFUNC_SELECTOR (void)
> if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
> return OPTIMIZE (sse4_1);
>
> - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
> - return OPTIMIZE (ssse3);
> -
> return OPTIMIZE (sse2);
> }
> diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
> deleted file mode 100644
> index df1b1fc494..0000000000
> --- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
> +++ /dev/null
> @@ -1,1992 +0,0 @@
> -/* memcmp with SSSE3, wmemcmp with SSSE3
> - Copyright (C) 2011-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#if IS_IN (libc)
> -
> -# include <sysdep.h>
> -
> -# ifndef MEMCMP
> -# define MEMCMP __memcmp_ssse3
> -# endif
> -
> -/* Warning!
> - wmemcmp has to use SIGNED comparison for elements.
> - memcmp has to use UNSIGNED comparison for elemnts.
> -*/
> -
> - atom_text_section
> -ENTRY (MEMCMP)
> -# ifdef USE_AS_WMEMCMP
> - shl $2, %RDX_LP
> - test %RDX_LP, %RDX_LP
> - jz L(equal)
> -# elif defined __ILP32__
> - /* Clear the upper 32 bits. */
> - mov %edx, %edx
> -# endif
> - mov %rdx, %rcx
> - mov %rdi, %rdx
> - cmp $48, %rcx;
> - jae L(48bytesormore) /* LEN => 48 */
> -
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -/* ECX >= 32. */
> -L(48bytesormore):
> - movdqu (%rdi), %xmm3
> - movdqu (%rsi), %xmm0
> - pcmpeqb %xmm0, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 16(%rdi), %rdi
> - lea 16(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(less16bytes)
> - mov %edi, %edx
> - and $0xf, %edx
> - xor %rdx, %rdi
> - sub %rdx, %rsi
> - add %rdx, %rcx
> - mov %esi, %edx
> - and $0xf, %edx
> - jz L(shr_0)
> - xor %rdx, %rsi
> -
> -# ifndef USE_AS_WMEMCMP
> - cmp $8, %edx
> - jae L(next_unaligned_table)
> - cmp $0, %edx
> - je L(shr_0)
> - cmp $1, %edx
> - je L(shr_1)
> - cmp $2, %edx
> - je L(shr_2)
> - cmp $3, %edx
> - je L(shr_3)
> - cmp $4, %edx
> - je L(shr_4)
> - cmp $5, %edx
> - je L(shr_5)
> - cmp $6, %edx
> - je L(shr_6)
> - jmp L(shr_7)
> -
> - .p2align 2
> -L(next_unaligned_table):
> - cmp $8, %edx
> - je L(shr_8)
> - cmp $9, %edx
> - je L(shr_9)
> - cmp $10, %edx
> - je L(shr_10)
> - cmp $11, %edx
> - je L(shr_11)
> - cmp $12, %edx
> - je L(shr_12)
> - cmp $13, %edx
> - je L(shr_13)
> - cmp $14, %edx
> - je L(shr_14)
> - jmp L(shr_15)
> -# else
> - cmp $0, %edx
> - je L(shr_0)
> - cmp $4, %edx
> - je L(shr_4)
> - cmp $8, %edx
> - je L(shr_8)
> - jmp L(shr_12)
> -# endif
> -
> - .p2align 4
> -L(shr_0):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - jae L(shr_0_gobble)
> - xor %eax, %eax
> - movdqa (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> - movdqa 16(%rsi), %xmm2
> - pcmpeqb 16(%rdi), %xmm2
> - pand %xmm1, %xmm2
> - pmovmskb %xmm2, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_0_gobble):
> - movdqa (%rsi), %xmm0
> - xor %eax, %eax
> - pcmpeqb (%rdi), %xmm0
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm2
> - pcmpeqb 16(%rdi), %xmm2
> -L(shr_0_gobble_loop):
> - pand %xmm0, %xmm2
> - sub $32, %rcx
> - pmovmskb %xmm2, %edx
> - movdqa %xmm0, %xmm1
> - movdqa 32(%rsi), %xmm0
> - movdqa 48(%rsi), %xmm2
> - sbb $0xffff, %edx
> - pcmpeqb 32(%rdi), %xmm0
> - pcmpeqb 48(%rdi), %xmm2
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - jz L(shr_0_gobble_loop)
> -
> - pand %xmm0, %xmm2
> - cmp $0, %rcx
> - jge L(next)
> - inc %edx
> - add $32, %rcx
> -L(next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm2, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -# ifndef USE_AS_WMEMCMP
> -
> - .p2align 4
> -L(shr_1):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_1_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $1, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $1, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $1, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_1_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $1, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $1, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_1_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $1, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $1, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_1_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_1_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_1_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 1(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -
> - .p2align 4
> -L(shr_2):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_2_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $2, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $2, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $2, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_2_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $2, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $2, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_2_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $2, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $2, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_2_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_2_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_2_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 2(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_3):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_3_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $3, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $3, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $3, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_3_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $3, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $3, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_3_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $3, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $3, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_3_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_3_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_3_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 3(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -# endif
> -
> - .p2align 4
> -L(shr_4):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_4_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $4, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $4, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $4, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_4_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $4, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $4, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_4_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $4, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $4, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_4_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_4_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_4_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 4(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -# ifndef USE_AS_WMEMCMP
> -
> - .p2align 4
> -L(shr_5):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_5_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $5, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $5, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $5, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_5_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $5, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $5, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_5_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $5, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $5, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_5_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_5_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_5_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 5(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_6):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_6_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $6, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $6, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $6, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_6_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $6, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $6, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_6_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $6, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $6, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_6_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_6_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_6_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 6(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_7):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_7_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $7, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $7, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $7, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_7_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $7, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $7, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_7_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $7, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $7, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_7_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_7_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_7_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 7(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -# endif
> -
> - .p2align 4
> -L(shr_8):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_8_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $8, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $8, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $8, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_8_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $8, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $8, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_8_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $8, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $8, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_8_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_8_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_8_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 8(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -# ifndef USE_AS_WMEMCMP
> -
> - .p2align 4
> -L(shr_9):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_9_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $9, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $9, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $9, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_9_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $9, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $9, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_9_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $9, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $9, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_9_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_9_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_9_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 9(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_10):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_10_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $10, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $10, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $10, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_10_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $10, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $10, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_10_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $10, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $10, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_10_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_10_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_10_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 10(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_11):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_11_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $11, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $11, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $11, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_11_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $11, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $11, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_11_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $11, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $11, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_11_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_11_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_11_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 11(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -# endif
> -
> - .p2align 4
> -L(shr_12):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_12_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $12, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $12, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $12, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_12_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $12, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $12, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_12_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $12, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $12, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_12_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_12_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_12_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 12(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -# ifndef USE_AS_WMEMCMP
> -
> - .p2align 4
> -L(shr_13):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_13_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $13, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $13, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $13, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_13_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $13, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $13, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_13_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $13, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $13, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_13_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_13_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_13_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 13(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_14):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_14_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $14, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $14, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $14, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_14_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $14, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $14, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_14_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $14, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $14, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_14_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_14_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_14_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 14(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_15):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_15_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $15, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $15, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $15, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_15_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $15, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $15, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_15_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $15, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $15, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_15_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_15_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_15_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 15(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -# endif
> - .p2align 4
> -L(exit):
> - pmovmskb %xmm1, %r8d
> - sub $0xffff, %r8d
> - jz L(first16bytes)
> - lea -16(%rsi), %rsi
> - lea -16(%rdi), %rdi
> - mov %r8d, %edx
> -L(first16bytes):
> - add %rax, %rsi
> -L(less16bytes):
> -# ifndef USE_AS_WMEMCMP
> - test %dl, %dl
> - jz L(next_24_bytes)
> -
> - test $0x01, %dl
> - jnz L(Byte16)
> -
> - test $0x02, %dl
> - jnz L(Byte17)
> -
> - test $0x04, %dl
> - jnz L(Byte18)
> -
> - test $0x08, %dl
> - jnz L(Byte19)
> -
> - test $0x10, %dl
> - jnz L(Byte20)
> -
> - test $0x20, %dl
> - jnz L(Byte21)
> -
> - test $0x40, %dl
> - jnz L(Byte22)
> -
> - movzbl -9(%rdi), %eax
> - movzbl -9(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte16):
> - movzbl -16(%rdi), %eax
> - movzbl -16(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte17):
> - movzbl -15(%rdi), %eax
> - movzbl -15(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte18):
> - movzbl -14(%rdi), %eax
> - movzbl -14(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte19):
> - movzbl -13(%rdi), %eax
> - movzbl -13(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte20):
> - movzbl -12(%rdi), %eax
> - movzbl -12(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte21):
> - movzbl -11(%rdi), %eax
> - movzbl -11(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte22):
> - movzbl -10(%rdi), %eax
> - movzbl -10(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(next_24_bytes):
> - lea 8(%rdi), %rdi
> - lea 8(%rsi), %rsi
> - test $0x01, %dh
> - jnz L(Byte16)
> -
> - test $0x02, %dh
> - jnz L(Byte17)
> -
> - test $0x04, %dh
> - jnz L(Byte18)
> -
> - test $0x08, %dh
> - jnz L(Byte19)
> -
> - test $0x10, %dh
> - jnz L(Byte20)
> -
> - test $0x20, %dh
> - jnz L(Byte21)
> -
> - test $0x40, %dh
> - jnz L(Byte22)
> -
> - movzbl -9(%rdi), %eax
> - movzbl -9(%rsi), %edx
> - sub %edx, %eax
> - ret
> -# else
> -/* special for wmemcmp */
> - xor %eax, %eax
> - test %dl, %dl
> - jz L(next_two_double_words)
> - and $15, %dl
> - jz L(second_double_word)
> - mov -16(%rdi), %eax
> - cmp -16(%rsi), %eax
> - jne L(find_diff)
> - ret
> -
> - .p2align 4
> -L(second_double_word):
> - mov -12(%rdi), %eax
> - cmp -12(%rsi), %eax
> - jne L(find_diff)
> - ret
> -
> - .p2align 4
> -L(next_two_double_words):
> - and $15, %dh
> - jz L(fourth_double_word)
> - mov -8(%rdi), %eax
> - cmp -8(%rsi), %eax
> - jne L(find_diff)
> - ret
> -
> - .p2align 4
> -L(fourth_double_word):
> - mov -4(%rdi), %eax
> - cmp -4(%rsi), %eax
> - jne L(find_diff)
> - ret
> -# endif
> -
> - .p2align 4
> -L(less48bytes):
> - cmp $8, %ecx
> - jae L(more8bytes)
> - cmp $0, %ecx
> - je L(0bytes)
> -# ifndef USE_AS_WMEMCMP
> - cmp $1, %ecx
> - je L(1bytes)
> - cmp $2, %ecx
> - je L(2bytes)
> - cmp $3, %ecx
> - je L(3bytes)
> - cmp $4, %ecx
> - je L(4bytes)
> - cmp $5, %ecx
> - je L(5bytes)
> - cmp $6, %ecx
> - je L(6bytes)
> - jmp L(7bytes)
> -# else
> - jmp L(4bytes)
> -# endif
> -
> - .p2align 4
> -L(more8bytes):
> - cmp $16, %ecx
> - jae L(more16bytes)
> - cmp $8, %ecx
> - je L(8bytes)
> -# ifndef USE_AS_WMEMCMP
> - cmp $9, %ecx
> - je L(9bytes)
> - cmp $10, %ecx
> - je L(10bytes)
> - cmp $11, %ecx
> - je L(11bytes)
> - cmp $12, %ecx
> - je L(12bytes)
> - cmp $13, %ecx
> - je L(13bytes)
> - cmp $14, %ecx
> - je L(14bytes)
> - jmp L(15bytes)
> -# else
> - jmp L(12bytes)
> -# endif
> -
> - .p2align 4
> -L(more16bytes):
> - cmp $24, %ecx
> - jae L(more24bytes)
> - cmp $16, %ecx
> - je L(16bytes)
> -# ifndef USE_AS_WMEMCMP
> - cmp $17, %ecx
> - je L(17bytes)
> - cmp $18, %ecx
> - je L(18bytes)
> - cmp $19, %ecx
> - je L(19bytes)
> - cmp $20, %ecx
> - je L(20bytes)
> - cmp $21, %ecx
> - je L(21bytes)
> - cmp $22, %ecx
> - je L(22bytes)
> - jmp L(23bytes)
> -# else
> - jmp L(20bytes)
> -# endif
> -
> - .p2align 4
> -L(more24bytes):
> - cmp $32, %ecx
> - jae L(more32bytes)
> - cmp $24, %ecx
> - je L(24bytes)
> -# ifndef USE_AS_WMEMCMP
> - cmp $25, %ecx
> - je L(25bytes)
> - cmp $26, %ecx
> - je L(26bytes)
> - cmp $27, %ecx
> - je L(27bytes)
> - cmp $28, %ecx
> - je L(28bytes)
> - cmp $29, %ecx
> - je L(29bytes)
> - cmp $30, %ecx
> - je L(30bytes)
> - jmp L(31bytes)
> -# else
> - jmp L(28bytes)
> -# endif
> -
> - .p2align 4
> -L(more32bytes):
> - cmp $40, %ecx
> - jae L(more40bytes)
> - cmp $32, %ecx
> - je L(32bytes)
> -# ifndef USE_AS_WMEMCMP
> - cmp $33, %ecx
> - je L(33bytes)
> - cmp $34, %ecx
> - je L(34bytes)
> - cmp $35, %ecx
> - je L(35bytes)
> - cmp $36, %ecx
> - je L(36bytes)
> - cmp $37, %ecx
> - je L(37bytes)
> - cmp $38, %ecx
> - je L(38bytes)
> - jmp L(39bytes)
> -# else
> - jmp L(36bytes)
> -# endif
> -
> - .p2align 4
> -L(more40bytes):
> - cmp $40, %ecx
> - je L(40bytes)
> -# ifndef USE_AS_WMEMCMP
> - cmp $41, %ecx
> - je L(41bytes)
> - cmp $42, %ecx
> - je L(42bytes)
> - cmp $43, %ecx
> - je L(43bytes)
> - cmp $44, %ecx
> - je L(44bytes)
> - cmp $45, %ecx
> - je L(45bytes)
> - cmp $46, %ecx
> - je L(46bytes)
> - jmp L(47bytes)
> -
> - .p2align 4
> -L(44bytes):
> - movl -44(%rdi), %eax
> - movl -44(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(40bytes):
> - movl -40(%rdi), %eax
> - movl -40(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(36bytes):
> - movl -36(%rdi), %eax
> - movl -36(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(32bytes):
> - movl -32(%rdi), %eax
> - movl -32(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(28bytes):
> - movl -28(%rdi), %eax
> - movl -28(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(24bytes):
> - movl -24(%rdi), %eax
> - movl -24(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(20bytes):
> - movl -20(%rdi), %eax
> - movl -20(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(16bytes):
> - movl -16(%rdi), %eax
> - movl -16(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(12bytes):
> - movl -12(%rdi), %eax
> - movl -12(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(8bytes):
> - movl -8(%rdi), %eax
> - movl -8(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(4bytes):
> - movl -4(%rdi), %eax
> - movl -4(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(0bytes):
> - xor %eax, %eax
> - ret
> -# else
> - .p2align 4
> -L(44bytes):
> - movl -44(%rdi), %eax
> - cmp -44(%rsi), %eax
> - jne L(find_diff)
> -L(40bytes):
> - movl -40(%rdi), %eax
> - cmp -40(%rsi), %eax
> - jne L(find_diff)
> -L(36bytes):
> - movl -36(%rdi), %eax
> - cmp -36(%rsi), %eax
> - jne L(find_diff)
> -L(32bytes):
> - movl -32(%rdi), %eax
> - cmp -32(%rsi), %eax
> - jne L(find_diff)
> -L(28bytes):
> - movl -28(%rdi), %eax
> - cmp -28(%rsi), %eax
> - jne L(find_diff)
> -L(24bytes):
> - movl -24(%rdi), %eax
> - cmp -24(%rsi), %eax
> - jne L(find_diff)
> -L(20bytes):
> - movl -20(%rdi), %eax
> - cmp -20(%rsi), %eax
> - jne L(find_diff)
> -L(16bytes):
> - movl -16(%rdi), %eax
> - cmp -16(%rsi), %eax
> - jne L(find_diff)
> -L(12bytes):
> - movl -12(%rdi), %eax
> - cmp -12(%rsi), %eax
> - jne L(find_diff)
> -L(8bytes):
> - movl -8(%rdi), %eax
> - cmp -8(%rsi), %eax
> - jne L(find_diff)
> -L(4bytes):
> - movl -4(%rdi), %eax
> - cmp -4(%rsi), %eax
> - jne L(find_diff)
> -L(0bytes):
> - xor %eax, %eax
> - ret
> -# endif
> -
> -# ifndef USE_AS_WMEMCMP
> - .p2align 4
> -L(45bytes):
> - movl -45(%rdi), %eax
> - movl -45(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(41bytes):
> - movl -41(%rdi), %eax
> - movl -41(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(37bytes):
> - movl -37(%rdi), %eax
> - movl -37(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(33bytes):
> - movl -33(%rdi), %eax
> - movl -33(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(29bytes):
> - movl -29(%rdi), %eax
> - movl -29(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(25bytes):
> - movl -25(%rdi), %eax
> - movl -25(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(21bytes):
> - movl -21(%rdi), %eax
> - movl -21(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(17bytes):
> - movl -17(%rdi), %eax
> - movl -17(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(13bytes):
> - movl -13(%rdi), %eax
> - movl -13(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(9bytes):
> - movl -9(%rdi), %eax
> - movl -9(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(5bytes):
> - movl -5(%rdi), %eax
> - movl -5(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(1bytes):
> - movzbl -1(%rdi), %eax
> - cmpb -1(%rsi), %al
> - jne L(set)
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -L(46bytes):
> - movl -46(%rdi), %eax
> - movl -46(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(42bytes):
> - movl -42(%rdi), %eax
> - movl -42(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(38bytes):
> - movl -38(%rdi), %eax
> - movl -38(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(34bytes):
> - movl -34(%rdi), %eax
> - movl -34(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(30bytes):
> - movl -30(%rdi), %eax
> - movl -30(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(26bytes):
> - movl -26(%rdi), %eax
> - movl -26(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(22bytes):
> - movl -22(%rdi), %eax
> - movl -22(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(18bytes):
> - movl -18(%rdi), %eax
> - movl -18(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(14bytes):
> - movl -14(%rdi), %eax
> - movl -14(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(10bytes):
> - movl -10(%rdi), %eax
> - movl -10(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(6bytes):
> - movl -6(%rdi), %eax
> - movl -6(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(2bytes):
> - movzwl -2(%rdi), %eax
> - movzwl -2(%rsi), %ecx
> - cmpb %cl, %al
> - jne L(set)
> - cmp %ecx, %eax
> - jne L(set)
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -L(47bytes):
> - movl -47(%rdi), %eax
> - movl -47(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(43bytes):
> - movl -43(%rdi), %eax
> - movl -43(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(39bytes):
> - movl -39(%rdi), %eax
> - movl -39(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(35bytes):
> - movl -35(%rdi), %eax
> - movl -35(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(31bytes):
> - movl -31(%rdi), %eax
> - movl -31(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(27bytes):
> - movl -27(%rdi), %eax
> - movl -27(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(23bytes):
> - movl -23(%rdi), %eax
> - movl -23(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(19bytes):
> - movl -19(%rdi), %eax
> - movl -19(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(15bytes):
> - movl -15(%rdi), %eax
> - movl -15(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(11bytes):
> - movl -11(%rdi), %eax
> - movl -11(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(7bytes):
> - movl -7(%rdi), %eax
> - movl -7(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(3bytes):
> - movzwl -3(%rdi), %eax
> - movzwl -3(%rsi), %ecx
> - cmpb %cl, %al
> - jne L(set)
> - cmp %ecx, %eax
> - jne L(set)
> - movzbl -1(%rdi), %eax
> - cmpb -1(%rsi), %al
> - jne L(set)
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -L(find_diff):
> - cmpb %cl, %al
> - jne L(set)
> - cmpw %cx, %ax
> - jne L(set)
> - shr $16, %eax
> - shr $16, %ecx
> - cmpb %cl, %al
> - jne L(set)
> -
> -/* We get here only if we already know there is a
> -difference. */
> -
> - cmp %ecx, %eax
> -L(set):
> - sbb %eax, %eax
> - sbb $-1, %eax
> - ret
> -# else
> -
> -/* for wmemcmp */
> - .p2align 4
> -L(find_diff):
> - mov $1, %eax
> - jg L(find_diff_bigger)
> - neg %eax
> - ret
> -
> - .p2align 4
> -L(find_diff_bigger):
> - ret
> -# endif
> -
> - .p2align 4
> -L(equal):
> - xor %eax, %eax
> - ret
> -
> -END (MEMCMP)
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
> deleted file mode 100644
> index a41ef95fc1..0000000000
> --- a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
> +++ /dev/null
> @@ -1,4 +0,0 @@
> -#define USE_AS_WMEMCMP 1
> -#define MEMCMP __wmemcmp_ssse3
> -
> -#include "memcmp-ssse3.S"
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
On Mär 25 2022, Noah Goldstein via Libc-alpha wrote:
> With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer
> SSSE3. As a result its no longer with the code size cost.
I think the second sentence is missing something. Also: s/its/it is/.
On Fri, Mar 25, 2022 at 3:34 PM Andreas Schwab <schwab@linux-m68k.org> wrote:
>
> On Mär 25 2022, Noah Goldstein via Libc-alpha wrote:
>
> > With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer
> > SSSE3. As a result its no longer with the code size cost.
>
> I think the second sentence is missing something. Also: s/its/it is/.
^
How's:
"As a result it is no longer worth it to keep the SSSE3 versions given
the code size cost."
>
> --
> Andreas Schwab, schwab@linux-m68k.org
> GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510 2552 DF73 E780 A9DA AEC1
> "And now for something completely different."
@@ -12,7 +12,6 @@ sysdep_routines += \
memcmp-evex-movbe \
memcmp-sse2 \
memcmp-sse4 \
- memcmp-ssse3 \
memcmpeq-avx2 \
memcmpeq-avx2-rtm \
memcmpeq-evex \
@@ -179,7 +178,6 @@ sysdep_routines += \
wmemcmp-c \
wmemcmp-evex-movbe \
wmemcmp-sse4 \
- wmemcmp-ssse3 \
# sysdep_routines
endif
@@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
__memcmp_sse4_1)
- IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
- __memcmp_ssse3)
IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
#ifdef SHARED
@@ -844,8 +842,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__wmemcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
__wmemcmp_sse4_1)
- IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
- __wmemcmp_ssse3)
IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
/* Support sysdeps/x86_64/multiarch/wmemset.c. */
@@ -20,7 +20,6 @@
# include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
@@ -50,8 +49,5 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
return OPTIMIZE (sse4_1);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
deleted file mode 100644
@@ -1,1992 +0,0 @@
-/* memcmp with SSSE3, wmemcmp with SSSE3
- Copyright (C) 2011-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-# define MEMCMP __memcmp_ssse3
-# endif
-
-/* Warning!
- wmemcmp has to use SIGNED comparison for elements.
- memcmp has to use UNSIGNED comparison for elements.
-*/
-
- atom_text_section
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
- shl $2, %RDX_LP
- test %RDX_LP, %RDX_LP
- jz L(equal)
-# elif defined __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-# endif
- mov %rdx, %rcx
- mov %rdi, %rdx
- cmp $48, %rcx;
- jae L(48bytesormore) /* LEN >= 48 */
-
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-/* RCX >= 48. */
-L(48bytesormore):
- movdqu (%rdi), %xmm3
- movdqu (%rsi), %xmm0
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %edx
- lea 16(%rdi), %rdi
- lea 16(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(less16bytes)
- mov %edi, %edx
- and $0xf, %edx
- xor %rdx, %rdi
- sub %rdx, %rsi
- add %rdx, %rcx
- mov %esi, %edx
- and $0xf, %edx
- jz L(shr_0)
- xor %rdx, %rsi
-
-# ifndef USE_AS_WMEMCMP
- cmp $8, %edx
- jae L(next_unaligned_table)
- cmp $0, %edx
- je L(shr_0)
- cmp $1, %edx
- je L(shr_1)
- cmp $2, %edx
- je L(shr_2)
- cmp $3, %edx
- je L(shr_3)
- cmp $4, %edx
- je L(shr_4)
- cmp $5, %edx
- je L(shr_5)
- cmp $6, %edx
- je L(shr_6)
- jmp L(shr_7)
-
- .p2align 2
-L(next_unaligned_table):
- cmp $8, %edx
- je L(shr_8)
- cmp $9, %edx
- je L(shr_9)
- cmp $10, %edx
- je L(shr_10)
- cmp $11, %edx
- je L(shr_11)
- cmp $12, %edx
- je L(shr_12)
- cmp $13, %edx
- je L(shr_13)
- cmp $14, %edx
- je L(shr_14)
- jmp L(shr_15)
-# else
- cmp $0, %edx
- je L(shr_0)
- cmp $4, %edx
- je L(shr_4)
- cmp $8, %edx
- je L(shr_8)
- jmp L(shr_12)
-# endif
-
- .p2align 4
-L(shr_0):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- jae L(shr_0_gobble)
- xor %eax, %eax
- movdqa (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
- movdqa 16(%rsi), %xmm2
- pcmpeqb 16(%rdi), %xmm2
- pand %xmm1, %xmm2
- pmovmskb %xmm2, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_0_gobble):
- movdqa (%rsi), %xmm0
- xor %eax, %eax
- pcmpeqb (%rdi), %xmm0
- sub $32, %rcx
- movdqa 16(%rsi), %xmm2
- pcmpeqb 16(%rdi), %xmm2
-L(shr_0_gobble_loop):
- pand %xmm0, %xmm2
- sub $32, %rcx
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- movdqa 32(%rsi), %xmm0
- movdqa 48(%rsi), %xmm2
- sbb $0xffff, %edx
- pcmpeqb 32(%rdi), %xmm0
- pcmpeqb 48(%rdi), %xmm2
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- jz L(shr_0_gobble_loop)
-
- pand %xmm0, %xmm2
- cmp $0, %rcx
- jge L(next)
- inc %edx
- add $32, %rcx
-L(next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_1):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_1_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $1, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $1, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $1, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_1_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $1, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $1, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_1_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $1, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $1, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_1_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_1_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_1_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 1(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-
- .p2align 4
-L(shr_2):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_2_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $2, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $2, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $2, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_2_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $2, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $2, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_2_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $2, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $2, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_2_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_2_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_2_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 2(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_3):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_3_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $3, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $3, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $3, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_3_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $3, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $3, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_3_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $3, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $3, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_3_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_3_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_3_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 3(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_4):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_4_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $4, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $4, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $4, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_4_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $4, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $4, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_4_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $4, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $4, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_4_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_4_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_4_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 4(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_5):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_5_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $5, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $5, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $5, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_5_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $5, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $5, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_5_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $5, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $5, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_5_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_5_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_5_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 5(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_6):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_6_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $6, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $6, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $6, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_6_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $6, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $6, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_6_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $6, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $6, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_6_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_6_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_6_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 6(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_7):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_7_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $7, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $7, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $7, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_7_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $7, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $7, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_7_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $7, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $7, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_7_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_7_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_7_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 7(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_8):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_8_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $8, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $8, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $8, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_8_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $8, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $8, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_8_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $8, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $8, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_8_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_8_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_8_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 8(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_9):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_9_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $9, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $9, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $9, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_9_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $9, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $9, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_9_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $9, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $9, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_9_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_9_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_9_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 9(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_10):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_10_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $10, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $10, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $10, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_10_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $10, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $10, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_10_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $10, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $10, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_10_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_10_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_10_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 10(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_11):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_11_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $11, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $11, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $11, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_11_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $11, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $11, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_11_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $11, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $11, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_11_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_11_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_11_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 11(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_12):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_12_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $12, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $12, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $12, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_12_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $12, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $12, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_12_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $12, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $12, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_12_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_12_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_12_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 12(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_13):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_13_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $13, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $13, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $13, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_13_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $13, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $13, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_13_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $13, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $13, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_13_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_13_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_13_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 13(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_14):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_14_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $14, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $14, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $14, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_14_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $14, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $14, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_14_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $14, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $14, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_14_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_14_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_14_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 14(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_15):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_15_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $15, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $15, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $15, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_15_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $15, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $15, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_15_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $15, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $15, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_15_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_15_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_15_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 15(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-# endif
- .p2align 4
-L(exit):
- pmovmskb %xmm1, %r8d
- sub $0xffff, %r8d
- jz L(first16bytes)
- lea -16(%rsi), %rsi
- lea -16(%rdi), %rdi
- mov %r8d, %edx
-L(first16bytes):
- add %rax, %rsi
-L(less16bytes):
-# ifndef USE_AS_WMEMCMP
- test %dl, %dl
- jz L(next_24_bytes)
-
- test $0x01, %dl
- jnz L(Byte16)
-
- test $0x02, %dl
- jnz L(Byte17)
-
- test $0x04, %dl
- jnz L(Byte18)
-
- test $0x08, %dl
- jnz L(Byte19)
-
- test $0x10, %dl
- jnz L(Byte20)
-
- test $0x20, %dl
- jnz L(Byte21)
-
- test $0x40, %dl
- jnz L(Byte22)
-
- movzbl -9(%rdi), %eax
- movzbl -9(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte16):
- movzbl -16(%rdi), %eax
- movzbl -16(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte17):
- movzbl -15(%rdi), %eax
- movzbl -15(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte18):
- movzbl -14(%rdi), %eax
- movzbl -14(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte19):
- movzbl -13(%rdi), %eax
- movzbl -13(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte20):
- movzbl -12(%rdi), %eax
- movzbl -12(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte21):
- movzbl -11(%rdi), %eax
- movzbl -11(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte22):
- movzbl -10(%rdi), %eax
- movzbl -10(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(next_24_bytes):
- lea 8(%rdi), %rdi
- lea 8(%rsi), %rsi
- test $0x01, %dh
- jnz L(Byte16)
-
- test $0x02, %dh
- jnz L(Byte17)
-
- test $0x04, %dh
- jnz L(Byte18)
-
- test $0x08, %dh
- jnz L(Byte19)
-
- test $0x10, %dh
- jnz L(Byte20)
-
- test $0x20, %dh
- jnz L(Byte21)
-
- test $0x40, %dh
- jnz L(Byte22)
-
- movzbl -9(%rdi), %eax
- movzbl -9(%rsi), %edx
- sub %edx, %eax
- ret
-# else
-/* special for wmemcmp */
- xor %eax, %eax
- test %dl, %dl
- jz L(next_two_double_words)
- and $15, %dl
- jz L(second_double_word)
- mov -16(%rdi), %eax
- cmp -16(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(second_double_word):
- mov -12(%rdi), %eax
- cmp -12(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(next_two_double_words):
- and $15, %dh
- jz L(fourth_double_word)
- mov -8(%rdi), %eax
- cmp -8(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(fourth_double_word):
- mov -4(%rdi), %eax
- cmp -4(%rsi), %eax
- jne L(find_diff)
- ret
-# endif
-
- .p2align 4
-L(less48bytes):
- cmp $8, %ecx
- jae L(more8bytes)
- cmp $0, %ecx
- je L(0bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $1, %ecx
- je L(1bytes)
- cmp $2, %ecx
- je L(2bytes)
- cmp $3, %ecx
- je L(3bytes)
- cmp $4, %ecx
- je L(4bytes)
- cmp $5, %ecx
- je L(5bytes)
- cmp $6, %ecx
- je L(6bytes)
- jmp L(7bytes)
-# else
- jmp L(4bytes)
-# endif
-
- .p2align 4
-L(more8bytes):
- cmp $16, %ecx
- jae L(more16bytes)
- cmp $8, %ecx
- je L(8bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $9, %ecx
- je L(9bytes)
- cmp $10, %ecx
- je L(10bytes)
- cmp $11, %ecx
- je L(11bytes)
- cmp $12, %ecx
- je L(12bytes)
- cmp $13, %ecx
- je L(13bytes)
- cmp $14, %ecx
- je L(14bytes)
- jmp L(15bytes)
-# else
- jmp L(12bytes)
-# endif
-
- .p2align 4
-L(more16bytes):
- cmp $24, %ecx
- jae L(more24bytes)
- cmp $16, %ecx
- je L(16bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $17, %ecx
- je L(17bytes)
- cmp $18, %ecx
- je L(18bytes)
- cmp $19, %ecx
- je L(19bytes)
- cmp $20, %ecx
- je L(20bytes)
- cmp $21, %ecx
- je L(21bytes)
- cmp $22, %ecx
- je L(22bytes)
- jmp L(23bytes)
-# else
- jmp L(20bytes)
-# endif
-
- .p2align 4
-L(more24bytes):
- cmp $32, %ecx
- jae L(more32bytes)
- cmp $24, %ecx
- je L(24bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $25, %ecx
- je L(25bytes)
- cmp $26, %ecx
- je L(26bytes)
- cmp $27, %ecx
- je L(27bytes)
- cmp $28, %ecx
- je L(28bytes)
- cmp $29, %ecx
- je L(29bytes)
- cmp $30, %ecx
- je L(30bytes)
- jmp L(31bytes)
-# else
- jmp L(28bytes)
-# endif
-
- .p2align 4
-L(more32bytes):
- cmp $40, %ecx
- jae L(more40bytes)
- cmp $32, %ecx
- je L(32bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $33, %ecx
- je L(33bytes)
- cmp $34, %ecx
- je L(34bytes)
- cmp $35, %ecx
- je L(35bytes)
- cmp $36, %ecx
- je L(36bytes)
- cmp $37, %ecx
- je L(37bytes)
- cmp $38, %ecx
- je L(38bytes)
- jmp L(39bytes)
-# else
- jmp L(36bytes)
-# endif
-
- .p2align 4
-L(more40bytes):
- cmp $40, %ecx
- je L(40bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $41, %ecx
- je L(41bytes)
- cmp $42, %ecx
- je L(42bytes)
- cmp $43, %ecx
- je L(43bytes)
- cmp $44, %ecx
- je L(44bytes)
- cmp $45, %ecx
- je L(45bytes)
- cmp $46, %ecx
- je L(46bytes)
- jmp L(47bytes)
-
- .p2align 4
-L(44bytes):
- movl -44(%rdi), %eax
- movl -44(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(40bytes):
- movl -40(%rdi), %eax
- movl -40(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(36bytes):
- movl -36(%rdi), %eax
- movl -36(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(32bytes):
- movl -32(%rdi), %eax
- movl -32(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(28bytes):
- movl -28(%rdi), %eax
- movl -28(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(24bytes):
- movl -24(%rdi), %eax
- movl -24(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(20bytes):
- movl -20(%rdi), %eax
- movl -20(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(16bytes):
- movl -16(%rdi), %eax
- movl -16(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(12bytes):
- movl -12(%rdi), %eax
- movl -12(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(8bytes):
- movl -8(%rdi), %eax
- movl -8(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(4bytes):
- movl -4(%rdi), %eax
- movl -4(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(0bytes):
- xor %eax, %eax
- ret
-# else
- .p2align 4
-L(44bytes):
- movl -44(%rdi), %eax
- cmp -44(%rsi), %eax
- jne L(find_diff)
-L(40bytes):
- movl -40(%rdi), %eax
- cmp -40(%rsi), %eax
- jne L(find_diff)
-L(36bytes):
- movl -36(%rdi), %eax
- cmp -36(%rsi), %eax
- jne L(find_diff)
-L(32bytes):
- movl -32(%rdi), %eax
- cmp -32(%rsi), %eax
- jne L(find_diff)
-L(28bytes):
- movl -28(%rdi), %eax
- cmp -28(%rsi), %eax
- jne L(find_diff)
-L(24bytes):
- movl -24(%rdi), %eax
- cmp -24(%rsi), %eax
- jne L(find_diff)
-L(20bytes):
- movl -20(%rdi), %eax
- cmp -20(%rsi), %eax
- jne L(find_diff)
-L(16bytes):
- movl -16(%rdi), %eax
- cmp -16(%rsi), %eax
- jne L(find_diff)
-L(12bytes):
- movl -12(%rdi), %eax
- cmp -12(%rsi), %eax
- jne L(find_diff)
-L(8bytes):
- movl -8(%rdi), %eax
- cmp -8(%rsi), %eax
- jne L(find_diff)
-L(4bytes):
- movl -4(%rdi), %eax
- cmp -4(%rsi), %eax
- jne L(find_diff)
-L(0bytes):
- xor %eax, %eax
- ret
-# endif
-
-# ifndef USE_AS_WMEMCMP
- .p2align 4
-L(45bytes):
- movl -45(%rdi), %eax
- movl -45(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(41bytes):
- movl -41(%rdi), %eax
- movl -41(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(37bytes):
- movl -37(%rdi), %eax
- movl -37(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(33bytes):
- movl -33(%rdi), %eax
- movl -33(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(29bytes):
- movl -29(%rdi), %eax
- movl -29(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(25bytes):
- movl -25(%rdi), %eax
- movl -25(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(21bytes):
- movl -21(%rdi), %eax
- movl -21(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(17bytes):
- movl -17(%rdi), %eax
- movl -17(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(13bytes):
- movl -13(%rdi), %eax
- movl -13(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(9bytes):
- movl -9(%rdi), %eax
- movl -9(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(5bytes):
- movl -5(%rdi), %eax
- movl -5(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(1bytes):
- movzbl -1(%rdi), %eax
- cmpb -1(%rsi), %al
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(46bytes):
- movl -46(%rdi), %eax
- movl -46(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(42bytes):
- movl -42(%rdi), %eax
- movl -42(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(38bytes):
- movl -38(%rdi), %eax
- movl -38(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(34bytes):
- movl -34(%rdi), %eax
- movl -34(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(30bytes):
- movl -30(%rdi), %eax
- movl -30(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(26bytes):
- movl -26(%rdi), %eax
- movl -26(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(22bytes):
- movl -22(%rdi), %eax
- movl -22(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(18bytes):
- movl -18(%rdi), %eax
- movl -18(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(14bytes):
- movl -14(%rdi), %eax
- movl -14(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(10bytes):
- movl -10(%rdi), %eax
- movl -10(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(6bytes):
- movl -6(%rdi), %eax
- movl -6(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(2bytes):
- movzwl -2(%rdi), %eax
- movzwl -2(%rsi), %ecx
- cmpb %cl, %al
- jne L(set)
- cmp %ecx, %eax
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(47bytes):
- movl -47(%rdi), %eax
- movl -47(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(43bytes):
- movl -43(%rdi), %eax
- movl -43(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(39bytes):
- movl -39(%rdi), %eax
- movl -39(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(35bytes):
- movl -35(%rdi), %eax
- movl -35(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(31bytes):
- movl -31(%rdi), %eax
- movl -31(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(27bytes):
- movl -27(%rdi), %eax
- movl -27(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(23bytes):
- movl -23(%rdi), %eax
- movl -23(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(19bytes):
- movl -19(%rdi), %eax
- movl -19(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(15bytes):
- movl -15(%rdi), %eax
- movl -15(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(11bytes):
- movl -11(%rdi), %eax
- movl -11(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(7bytes):
- movl -7(%rdi), %eax
- movl -7(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(3bytes):
- movzwl -3(%rdi), %eax
- movzwl -3(%rsi), %ecx
- cmpb %cl, %al
- jne L(set)
- cmp %ecx, %eax
- jne L(set)
- movzbl -1(%rdi), %eax
- cmpb -1(%rsi), %al
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(find_diff):
- cmpb %cl, %al
- jne L(set)
- cmpw %cx, %ax
- jne L(set)
- shr $16, %eax
- shr $16, %ecx
- cmpb %cl, %al
- jne L(set)
-
-/* We get there only if we already know there is a
-difference. */
-
- cmp %ecx, %eax
-L(set):
- sbb %eax, %eax
- sbb $-1, %eax
- ret
-# else
-
-/* for wmemcmp */
- .p2align 4
-L(find_diff):
- mov $1, %eax
- jg L(find_diff_bigger)
- neg %eax
- ret
-
- .p2align 4
-L(find_diff_bigger):
- ret
-# endif
-
- .p2align 4
-L(equal):
- xor %eax, %eax
- ret
-
-END (MEMCMP)
-#endif
deleted file mode 100644
@@ -1,4 +0,0 @@
-#define USE_AS_WMEMCMP 1
-#define MEMCMP __wmemcmp_ssse3
-
-#include "memcmp-ssse3.S"