x86-64: Add memcmp/wmemcmp optimized with AVX2

Message ID 20170601154519.GB14526@lucon.org
State New, archived
Headers

Commit Message

Lu, Hongjiu June 1, 2017, 3:45 p.m. UTC
  Optimize x86-64 memcmp/wmemcmp with AVX2.  It uses vector compare as
much as possible.  It is as fast as SSE4 memcmp for size <= 16 bytes
and up to 2X faster for size > 16 bytes on Haswell and Skylake.  Select
AVX2 memcmp/wmemcmp on AVX2 machines where vzeroupper is preferred and
AVX unaligned load is fast.

Key features:

1. Use overlapping compare to avoid branch.
2. Use vector compare when size >= 4 bytes for memcmp or size >= 8
   bytes for wmemcmp.
3. If size is 8 * VEC_SIZE or less, unroll the loop.
4. Compare 4 * VEC_SIZE at a time with the aligned first memory area.
5. Use 2 vector compares when size is 2 * VEC_SIZE or less.
6. Use 4 vector compares when size is 4 * VEC_SIZE or less.
7. Use 8 vector compares when size is 8 * VEC_SIZE or less.
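
(Illustration only: a minimal C sketch of the overlapping-compare idea in
items 1 and 5, using hypothetical 8-byte chunks instead of the 32-byte YMM
vectors the assembly actually uses.  chunk_differs and differs_8_to_16 are
made-up names, not part of the patch.)

  #include <stdint.h>
  #include <string.h>

  /* Sketch of one vector compare: nonzero if two 8-byte chunks differ.
     The real code does this with VPCMPEQ/vpmovmskb on 32-byte vectors.  */
  static int chunk_differs (const unsigned char *a, const unsigned char *b)
  {
    uint64_t x, y;
    memcpy (&x, a, 8);
    memcpy (&y, b, 8);
    return x != y;
  }

  /* Overlapping compare for 8 <= n <= 16: check the first 8 bytes and the
     last 8 bytes.  The two loads overlap when n < 16, so every byte is
     covered without branching on the exact size.  */
  static int differs_8_to_16 (const unsigned char *s1,
                              const unsigned char *s2, size_t n)
  {
    return chunk_differs (s1, s2)
           || chunk_differs (s1 + n - 8, s2 + n - 8);
  }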

Any comments?

H.J.
---
	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
	memcmp-avx2 and wmemcmp-avx2.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Test __memcmp_avx2 and __wmemcmp_avx2.
	* sysdeps/x86_64/multiarch/memcmp-avx2.S: New file.
	* sysdeps/x86_64/multiarch/wmemcmp-avx2.S: Likewise.
	* sysdeps/x86_64/multiarch/memcmp.S: Use __memcmp_avx2 on AVX
	2 machines if AVX unaligned load is fast and vzeroupper is
	preferred.
	* sysdeps/x86_64/multiarch/wmemcmp.S: Use __wmemcmp_avx2 on AVX
	2 machines if AVX unaligned load is fast and vzeroupper is
	preferred.
---
 sysdeps/x86_64/multiarch/Makefile          |   5 +-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |   6 +
 sysdeps/x86_64/multiarch/memcmp-avx2.S     | 430 +++++++++++++++++++++++++++++
 sysdeps/x86_64/multiarch/memcmp.S          |  11 +-
 sysdeps/x86_64/multiarch/wmemcmp-avx2.S    |   4 +
 sysdeps/x86_64/multiarch/wmemcmp.S         |  11 +-
 6 files changed, 464 insertions(+), 3 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/memcmp-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-avx2.S
  

Comments

Florian Weimer June 1, 2017, 4:41 p.m. UTC | #1
On 06/01/2017 05:45 PM, H.J. Lu wrote:
> +L(between_4_7):
> +	vmovd	(%rdi), %xmm1
> +	vmovd	(%rsi), %xmm2
> +	VPCMPEQ %xmm1, %xmm2, %xmm2
> +	vpmovmskb %xmm2, %eax
> +	subl    $0xffff, %eax
> +	jnz	L(first_vec)

Is this really faster than two 32-bit bswaps followed by a sub?

> +	leaq	-4(%rdi, %rdx), %rdi
> +	leaq	-4(%rsi, %rdx), %rsi
> +	vmovd	(%rdi), %xmm1
> +	vmovd	(%rsi), %xmm2
> +	VPCMPEQ %xmm1, %xmm2, %xmm2
> +	vpmovmskb %xmm2, %eax
> +	subl    $0xffff, %eax
> +	jnz	L(first_vec)
> +	ret

What is ensuring alignment, so that the vmovd instructions cannot fault?

> +	.p2align 4
> +L(between_2_3):
> +	/* Load 2 bytes into registers.  */
> +	movzwl	(%rdi), %eax
> +	movzwl	(%rsi), %ecx
> +	/* Compare the lowest byte.  */
> +	cmpb	%cl, %al
> +	jne	L(1byte_reg)
> +	/* Load the difference of 2 bytes into EAX.  */
> +	subl	%ecx, %eax
> +	/* Return if 2 bytes differ.  */
> +	jnz	L(exit)
> +	cmpb	$2, %dl
> +	/* Return if these are the last 2 bytes.  */
> +	je	L(exit)
> +	movzbl	2(%rdi), %eax
> +	movzbl	2(%rsi), %ecx
> +	subl	%ecx, %eax
> +	ret

Again, bswap should be faster, and if we assume that the ordering of the
inputs is more difficult to predict than the length, it would be better
to construct the full 24-bit value before comparing it.
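
(A rough C sketch of this suggestion, as one reading of it; memcmp_2_3 is a
made-up name and this is not code from the patch: pack the 2 or 3 bytes of
each buffer most-significant-first, so a single unsigned comparison of the
24-bit values yields the memcmp result.)

  #include <stdint.h>
  #include <stddef.h>

  /* Sketch: memcmp for n == 2 or n == 3.  Build a big-endian-ordered value
     from each buffer; for n == 2 the third byte stays zero on both sides.  */
  static int memcmp_2_3 (const unsigned char *s1, const unsigned char *s2,
                         size_t n)
  {
    uint32_t a = (s1[0] << 16) | (s1[1] << 8) | (n == 3 ? s1[2] : 0);
    uint32_t b = (s2[0] << 16) | (s2[1] << 8) | (n == 3 ? s2[2] : 0);
    return (a > b) - (a < b);
  }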

Thanks,
Florian
  
H.J. Lu June 1, 2017, 5:19 p.m. UTC | #2
On Thu, Jun 1, 2017 at 9:41 AM, Florian Weimer <fweimer@redhat.com> wrote:
> On 06/01/2017 05:45 PM, H.J. Lu wrote:
>> +L(between_4_7):
>> +     vmovd   (%rdi), %xmm1
>> +     vmovd   (%rsi), %xmm2
>> +     VPCMPEQ %xmm1, %xmm2, %xmm2
>> +     vpmovmskb %xmm2, %eax
>> +     subl    $0xffff, %eax
>> +     jnz     L(first_vec)
>
> Is this really faster than two 32-bit bswaps followed by a sub?

Can you elaborate how to use bswap here?

>> +     leaq    -4(%rdi, %rdx), %rdi
>> +     leaq    -4(%rsi, %rdx), %rsi
>> +     vmovd   (%rdi), %xmm1
>> +     vmovd   (%rsi), %xmm2
>> +     VPCMPEQ %xmm1, %xmm2, %xmm2
>> +     vpmovmskb %xmm2, %eax
>> +     subl    $0xffff, %eax
>> +     jnz     L(first_vec)
>> +     ret
>
> What is ensuring alignment, so that the vmovd instructions cannot fault?

What do you mean?  This sequence compares the last 4 bytes with
vmovd,  which loads 4 bytes and zeroes out the high 12 bytes, and
VPCMPEQ.  If they aren't the same, go to L(first_vec).

>> +     .p2align 4
>> +L(between_2_3):
>> +     /* Load 2 bytes into registers.  */
>> +     movzwl  (%rdi), %eax
>> +     movzwl  (%rsi), %ecx
>> +     /* Compare the lowest byte.  */
>> +     cmpb    %cl, %al
>> +     jne     L(1byte_reg)
>> +     /* Load the difference of 2 bytes into EAX.  */
>> +     subl    %ecx, %eax
>> +     /* Return if 2 bytes differ.  */
>> +     jnz     L(exit)
>> +     cmpb    $2, %dl
>> +     /* Return if these are the last 2 bytes.  */
>> +     je      L(exit)
>> +     movzbl  2(%rdi), %eax
>> +     movzbl  2(%rsi), %ecx
>> +     subl    %ecx, %eax
>> +     ret
>
> Again, bswap should be faster, and if we assume that the ordering of the
> inputs is more difficult to predict than the length, it would be better
> to construct the full 24-bit value before comparing it.
>

Can you elaborate it here?

Thanks.
  
Florian Weimer June 1, 2017, 6:39 p.m. UTC | #3
On 06/01/2017 07:19 PM, H.J. Lu wrote:
> On Thu, Jun 1, 2017 at 9:41 AM, Florian Weimer <fweimer@redhat.com> wrote:
>> On 06/01/2017 05:45 PM, H.J. Lu wrote:
>>> +L(between_4_7):
>>> +     vmovd   (%rdi), %xmm1
>>> +     vmovd   (%rsi), %xmm2
>>> +     VPCMPEQ %xmm1, %xmm2, %xmm2
>>> +     vpmovmskb %xmm2, %eax
>>> +     subl    $0xffff, %eax
>>> +     jnz     L(first_vec)
>>
>> Is this really faster than two 32-bit bswaps followed by a sub?
> 
> Can you elaborate how to use bswap here?

Something like this:

  /* Load 4 to 7 bytes into an 8-byte word.
     ABCDEFG turns into GFEDDCBA.
     ABCDEF  turns into FEDCDCBA.
     ABCDE   turns into EDCBDCBA.
     ABCD    turns into DCBADCBA.
     bswapq below reverses the order of bytes.
     The duplicated bytes do not affect the comparison result.  */
  movl -4(%rdi, %rdx), R1
  shrq $32, R1
  movl -4(%rsi, %rdx), R2
  shrq $32, R2
  movl (%rdi), R3
  orq R3, R1
  /* Variant below starts after this point. */
  cmpq R1, R2
  jne L(diffin8bytes)
  xor %eax, %eax
  ret

L(diffin8bytes):
  bswapq R1
  bswapq R2
  cmpq R1, R2
  sbbl %eax, %eax	/* Set to -1 if R1 < R2, otherwise 0.  */
  orl $1, %eax		/* Turn 0 into 1, but preserve -1.  */
  ret

(Not sure about the right ordering for R1 and R2 here.)

There's a way to avoid the conditional jump completely, but whether
that's worthwhile depends on the cost of the bswapq and the cmove:

  bswapq R1
  bswapq R2
  xorl R3, R3
  cmpq R1, R2
  sbbl %eax, %eax
  orl $1, %eax
  cmpq R1, R2
  cmove R3, %eax
  ret

See this patch and the related discussion:

  <https://sourceware.org/ml/libc-alpha/2014-02/msg00139.html>
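
(In C terms, a sketch of what both variants compute, assuming the two 8-byte
words were loaded little-endian from the buffers; cmp8_via_bswap is a made-up
name and __builtin_bswap64 stands in for bswapq/movbe.)

  #include <stdint.h>

  /* Sketch: after the byte swap the byte at the lowest address is the most
     significant, so an unsigned comparison orders the words exactly as
     memcmp orders the buffers.  (a > b) - (a < b) yields -1, 0 or 1 with
     no data-dependent branch.  */
  static int cmp8_via_bswap (uint64_t w1, uint64_t w2)
  {
    uint64_t a = __builtin_bswap64 (w1);
    uint64_t b = __builtin_bswap64 (w2);
    return (a > b) - (a < b);
  }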

>> What is ensuring alignment, so that the vmovd instructions cannot fault?
> 
> What do you mean?  This sequence compares the last 4 bytes with
> vmovd,  which loads 4 bytes and zeroes out the high 12 bytes, and
> VPCMPEQ.  If they aren't the same, go to L(first_vec).

Ah, I see now.  The loads overlap.  Maybe add a comment to that effect?

Thanks,
Florian
  
H.J. Lu June 1, 2017, 8:57 p.m. UTC | #4
On Thu, Jun 1, 2017 at 11:39 AM, Florian Weimer <fweimer@redhat.com> wrote:
> On 06/01/2017 07:19 PM, H.J. Lu wrote:
>> On Thu, Jun 1, 2017 at 9:41 AM, Florian Weimer <fweimer@redhat.com> wrote:
>>> On 06/01/2017 05:45 PM, H.J. Lu wrote:
>>>> +L(between_4_7):
>>>> +     vmovd   (%rdi), %xmm1
>>>> +     vmovd   (%rsi), %xmm2
>>>> +     VPCMPEQ %xmm1, %xmm2, %xmm2
>>>> +     vpmovmskb %xmm2, %eax
>>>> +     subl    $0xffff, %eax
>>>> +     jnz     L(first_vec)
>>>
>>> Is this really faster than two 32-bit bswaps followed by a sub?
>>
>> Can you elaborate how to use bswap here?
>
> Something like this:
>
>   /* Load 4 to 7 bytes into an 8-byte word.
>      ABCDEFG turns into GFEDDCBA.
>      ABCDEF  turns into FEDCDCBA.
>      ABCDE   turns into EDCBDCBA.
>      ABCD    turns into DCBADCBA.
>      bswapq below reverses the order of bytes.
>      The duplicated bytes do not affect the comparison result.  */
>   movl -4(%rdi, %rdx), R1
>   shrq $32, R1
>   movl -4(%rsi, %rdx), R2
>   shrq $32, R2
>   movl (%rdi), R3
>   orq R3, R1
>   /* Variant below starts after this point. */
>   cmpq R1, R2
>   jne L(diffin8bytes)
>   xor %eax, %eax
>   ret
>
> L(diffin8bytes):
>   bswapq R1
>   bswapq R2
>   cmpq R1, R2
>   sbbl %eax, %eax       /* Set to -1 if R1 < R2, otherwise 0.  */
>   orl $1, %eax          /* Turn 0 into 1, but preserve -1.  */
>   ret

I don't think it works with memcmp since the return value depends on
the first byte which differs.  Say

ABCDE   turns into EDCBDCBA

If all bytes differ, we should only compare A, not EDCBDCBA.

> (Not sure about the right ordering for R1 and R2 here.)
>
> There's a way to avoid the conditional jump completely, but whether
> that's worthwhile depends on the cost of the bswapq and the cmove:
>
>   bswapq R1
>   bswapq R2
>   xorl R3, R3
>   cmpq R1, R2
>   sbbl %eax, %eax
>   orl $1, %eax
>   cmpq R1, R2
>   cmove R3, %eax
>   ret
>
> See this patch and the related discussion:
>
>   <https://sourceware.org/ml/libc-alpha/2014-02/msg00139.html>
>
>>> What is ensuring alignment, so that the vmovd instructions cannot fault?
>>
>> What do you mean?  This sequence compares the last 4 bytes with
>> vmovd,  which loads 4 bytes and zeroes out the high 12 bytes, and
>> VPCMPEQ.  If they aren't the same, go to L(first_vec).
>
> Ah, I see now.  The loads overlap.  Maybe add a comment to that effect?

I will add

/* Use overlapping loads to avoid branches.  */
  
Florian Weimer June 1, 2017, 9 p.m. UTC | #5
On 06/01/2017 10:57 PM, H.J. Lu wrote:
> I don't think it works with memcmp since the return value depends on
> the first byte which differs.  Say
> 
> ABCDE   turns into EDCBDCBA
> 
> If all bytes differ, we should only compare A, not EDCBDCBA.

That's what the bswapq is for, it reverses the order of bytes.

Florian
  
H.J. Lu June 1, 2017, 9:17 p.m. UTC | #6
On Thu, Jun 1, 2017 at 2:00 PM, Florian Weimer <fweimer@redhat.com> wrote:
> On 06/01/2017 10:57 PM, H.J. Lu wrote:
>> I don't think it works with memcmp since the return value depends on
>> the first byte which differs.  Say
>>
>> ABCDE   turns into EDCBDCBA
>>
>> If all bytes differ, we should only compare A, not EDCBDCBA.
>
> That's what the bswapq is for, it reverses the order of bytes.
>

bswapq doesn't help since cmpq compares 8 bytes but only
the last byte matters.  Comparing the highest byte gives you the
wrong result, like

0x36775382d1367753
0x7b8d14025b7b8d14
  
Florian Weimer June 1, 2017, 9:20 p.m. UTC | #7
On 06/01/2017 11:17 PM, H.J. Lu wrote:
> On Thu, Jun 1, 2017 at 2:00 PM, Florian Weimer <fweimer@redhat.com> wrote:
>> On 06/01/2017 10:57 PM, H.J. Lu wrote:
>>> I don't think it works with memcmp since the return value depends on
>>> the first byte which differs.  Say
>>>
>>> ABCDE   turns into EDCBDCBA
>>>
>>> If all bytes differ, we should only compare A, not EDCBDCBA.
>>
>> That's what the bswapq is for, it reverses the order of bytes.
>>
> 
> bswapq doesn't help since cmpq compares 8 bytes but only
> the last byte matters.  Comparing the highest byte gives you the
> wrong result, like
> 
> 0x36775382d1367753
> 0x7b8d14025b7b8d14

I don't understand.  On big-endian, to compare two 8-byte arrays as if
by memcmp, you can certainly do a uint64_t load, compute the difference
as a 65-bit value, and return the integer sign of that.

The code I posted does that (modulo bugs, but you can get a working
patch from the old message I referenced).  bswapq is needed to get an
equivalent to that big-endian load.

Thanks,
Florian
  
H.J. Lu June 1, 2017, 9:29 p.m. UTC | #8
On Thu, Jun 1, 2017 at 2:20 PM, Florian Weimer <fweimer@redhat.com> wrote:
> On 06/01/2017 11:17 PM, H.J. Lu wrote:
>> On Thu, Jun 1, 2017 at 2:00 PM, Florian Weimer <fweimer@redhat.com> wrote:
>>> On 06/01/2017 10:57 PM, H.J. Lu wrote:
>>>> I don't think it works with memcmp since the return value depends on
>>>> the first byte which differs.  Say
>>>>
>>>> ABCDE   turns into EDCBDCBA
>>>>
>>>> If all bytes differ, we should only compare A, not EDCBDCBA.
>>>
>>> That's what the bswapq is for, it reverses the order of bytes.
>>>
>>
>> bswapq doesn't help since cmpq compares 8 bytes but only
>> the last byte matters.  Comparing the highest byte gives you the
>> wrong result, like
>>
>> 0x36775382d1367753
>> 0x7b8d14025b7b8d14
>
> I don't understand.  On big-endian, to compare two 8-byte arrays as if
> by memcmp, you can certainly do a uint64_t load, compute the difference
> as a 65-bit value, and return the integer sign of that.
>
> The code I posted does that (modulo bugs, but you can get a working
> patch from the old message I referenced).  bswapq is needed to get an
> equivalent to that big-endian load.
>

I put memcmp-avx2.S on hjl/avx2/master branch and changed it
to

L(between_4_7):
        movl    (%rdi), %r8d
        movl    (%rsi), %ecx
        shlq    $32, %r8
        shlq    $32, %rcx
        movl    -4(%rdi, %rdx), %edi
        movl    -4(%rsi, %rdx), %esi
        orq     %rdi, %r8
        orq     %rsi, %rcx
        bswap   %r8
        bswap   %rcx
        cmpq    %rcx, %r8
        je      L(zero)
        sbbl    %eax, %eax
        orl     $1, %eax
        ret

and got

Iteration 70485 - wrong result in function __memcmp_avx2 (18, 26, 5,
0) -1 != 1, p1 0x7ffff7ff0e00 p2 0x7ffff7fece00

What did I do wrong?
  
Florian Weimer June 1, 2017, 9:31 p.m. UTC | #9
On 06/01/2017 11:29 PM, H.J. Lu wrote:
> L(between_4_7):
>         movl    (%rdi), %r8d
>         movl    (%rsi), %ecx
>         shlq    $32, %r8
>         shlq    $32, %rcx
>         movl    -4(%rdi, %rdx), %edi
>         movl    -4(%rsi, %rdx), %esi
>         orq     %rdi, %r8
>         orq     %rsi, %rcx
>         bswap   %r8
>         bswap   %rcx
>         cmpq    %rcx, %r8
>         je      L(zero)
>         sbbl    %eax, %eax
>         orl     $1, %eax
>         ret
> 
> and got
> 
> Iteration 70485 - wrong result in function __memcmp_avx2 (18, 26, 5,
> 0) -1 != 1, p1 0x7ffff7ff0e00 p2 0x7ffff7fece00
> 
> What did I do wrong?

I think you created some PDP-endian thing there.  The 4 bytes at (%rdi)
need to remain in the lower part of %r8, up until the bswap.  In other
words, you need to shift the 4 bytes loaded from -4(%rdi, %rdx).
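
(A C sketch of the corrected combination, assuming 4 <= n <= 7; load32 and
memcmp_4_7 are made-up helpers, not code from the branch: the first 4 bytes
stay in the low half and the trailing 4 bytes are shifted into the high half,
so that after the byte swap the first buffer byte is the most significant and
the duplicated overlap bytes cannot change the result.)

  #include <stdint.h>
  #include <string.h>

  static uint32_t load32 (const unsigned char *p)   /* little-endian load */
  {
    uint32_t v;
    memcpy (&v, p, 4);
    return v;
  }

  /* Sketch of the corrected between_4_7 layout for 4 <= n <= 7.  */
  static int memcmp_4_7 (const unsigned char *s1, const unsigned char *s2,
                         size_t n)
  {
    uint64_t a = ((uint64_t) load32 (s1 + n - 4) << 32) | load32 (s1);
    uint64_t b = ((uint64_t) load32 (s2 + n - 4) << 32) | load32 (s2);
    a = __builtin_bswap64 (a);
    b = __builtin_bswap64 (b);
    return (a > b) - (a < b);
  }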

Thanks,
Florian
  
Richard Henderson June 1, 2017, 10:14 p.m. UTC | #10
On 06/01/2017 02:20 PM, Florian Weimer wrote:
>    bswapq is needed to get an
> equivalent to that big-endian load.

Don't forget about movbe, which will also be available on these AVX2 machines.


r~
  
Ondrej Bilka June 15, 2017, 12:34 p.m. UTC | #11
On Thu, Jun 01, 2017 at 08:45:19AM -0700, H.J. Lu wrote:
> Optimize x86-64 memcmp/wmemcmp with AVX2.  It uses vector compare as
> much as possible.  It is as fast as SSE4 memcmp for size <= 16 bytes
> and up to 2X faster for size > 16 bytes on Haswell and Skylake.  Select
> AVX2 memcmp/wmemcmp on AVX2 machines where vzeroupper is preferred and
> AVX unaligned load is fast.
> 
> Key features:
> 
> 1. Use overlapping compare to avoid branch.
> 2. Use vector compare when size >= 4 bytes for memcmp or size >= 8
>    bytes for wmemcmp.
> 3. If size is 8 * VEC_SIZE or less, unroll the loop.
> 4. Compare 4 * VEC_SIZE at a time with the aligned first memory area.
> 5. Use 2 vector compares when size is 2 * VEC_SIZE or less.
> 6. Use 4 vector compares when size is 4 * VEC_SIZE or less.
> 7. Use 8 vector compares when size is 8 * VEC_SIZE or less.
> 
> Any comments?
>
I have some comments; it's similar to one of my previous patches.

> +     cmpq    $(VEC_SIZE * 2), %rdx
> +     ja      L(more_2x_vec)
> +
This is an unnecessary branch; it's likely that there is a difference in
the first 16 bytes regardless of size. Move the size test...
> +L(last_2x_vec):
> +     /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
> +     vmovdqu (%rsi), %ymm2
> +     VPCMPEQ (%rdi), %ymm2, %ymm2
> +     vpmovmskb %ymm2, %eax
> +     subl    $VEC_MASK, %eax
> +     jnz     L(first_vec)
here.



> +L(first_vec):
> +	/* A byte or int32 is different within 16 or 32 bytes.  */
> +	bsfl	%eax, %ecx
> +# ifdef USE_AS_WMEMCMP
> +	xorl	%eax, %eax
> +	movl	(%rdi, %rcx), %edx
> +	cmpl	(%rsi, %rcx), %edx
> +L(wmemcmp_return):
> +	setl	%al
> +	negl	%eax
> +	orl	$1, %eax
> +# else
> +	movzbl	(%rdi, %rcx), %eax
> +	movzbl	(%rsi, %rcx), %edx
> +	sub	%edx, %eax
> +# endif
> +	VZEROUPPER
> +	ret
> +

Loading bytes depending on the result of bsf is slow; an alternative is to
find that from the vector tests. I could avoid it using tests like this, but
I haven't measured the performance or tested it yet.

vmovdqu (%rdi), %ymm3

VPCMPGTQ %ymm2, %ymm3, %ymm4
VPCMPGTQ %ymm3, %ymm2, %ymm5
vpmovmskb %ymm4, %eax
vpmovmskb %ymm5, %edx
neg %eax
neg %edx
lzcnt %eax, %eax
lzcnt %edx, %edx
sub %edx, %eax
ret



> +	.p2align 4
> +L(less_vec):
> +# ifdef USE_AS_WMEMCMP
> +	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
> +	cmpb	$4, %dl
> +	je	L(4)
> +	jb	L(zero)
> +# else
> +	cmpb	$1, %dl
> +	je	L(1)
> +	jb	L(zero)
> +	cmpb	$4, %dl
> +	jb	L(between_2_3)
> +	cmpb	$8, %dl
> +	jb	L(between_4_7)
> +# endif
> +	cmpb	$16, %dl
> +	jae	L(between_16_31)

I am not entirely sure about this, as it depends on whether one calls memcmp
with fixed sizes in a loop or not. If the size is unpredictable, first test
whether the loads cross a page boundary and handle that as a special case;
if not, do a 32-byte comparison and return 0 if the first differing byte is
beyond the size.
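
(For reference, the page-boundary test mentioned here usually looks roughly
like the C sketch below; load_crosses_page is a made-up name and 4096 is an
assumed page size, not something taken from this patch.  If neither source
crosses into the next page, a full 32-byte compare is safe even when the size
is smaller, and mask bits past the size can simply be ignored.)

  #include <stdint.h>

  #define PAGE_SIZE 4096
  #define VEC_SIZE  32

  /* Nonzero if a VEC_SIZE-byte load starting at p would cross into the
     next page, i.e. might touch unmapped memory even though the first
     size < VEC_SIZE bytes are valid.  */
  static int load_crosses_page (const void *p)
  {
    return ((uintptr_t) p & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE;
  }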
  
H.J. Lu June 16, 2017, 2:15 a.m. UTC | #12
On Thu, Jun 15, 2017 at 5:34 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> On Thu, Jun 01, 2017 at 08:45:19AM -0700, H.J. Lu wrote:
>> Optimize x86-64 memcmp/wmemcmp with AVX2.  It uses vector compare as
>> much as possible.  It is as fast as SSE4 memcmp for size <= 16 bytes
>> and up to 2X faster for size > 16 bytes on Haswell and Skylake.  Select
>> AVX2 memcmp/wmemcmp on AVX2 machines where vzeroupper is preferred and
>> AVX unaligned load is fast.
>>
>> Key features:
>>
>> 1. Use overlapping compare to avoid branch.
>> 2. Use vector compare when size >= 4 bytes for memcmp or size >= 8
>>    bytes for wmemcmp.
>> 3. If size is 8 * VEC_SIZE or less, unroll the loop.
>> 4. Compare 4 * VEC_SIZE at a time with the aligned first memory area.
>> 5. Use 2 vector compares when size is 2 * VEC_SIZE or less.
>> 6. Use 4 vector compares when size is 4 * VEC_SIZE or less.
>> 7. Use 8 vector compares when size is 8 * VEC_SIZE or less.
>>
>> Any comments?
>>
> I have some comments; it's similar to one of my previous patches.
>
>> +     cmpq    $(VEC_SIZE * 2), %rdx
>> +     ja      L(more_2x_vec)
>> +
> This is an unnecessary branch; it's likely that there is a difference in
> the first 16 bytes regardless of size. Move the size test...
>> +L(last_2x_vec):
>> +     /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
>> +     vmovdqu (%rsi), %ymm2
>> +     VPCMPEQ (%rdi), %ymm2, %ymm2
>> +     vpmovmskb %ymm2, %eax
>> +     subl    $VEC_MASK, %eax
>> +     jnz     L(first_vec)
> here.
>

If we do that, the size check will be redundant from

        /* Less than 4 * VEC.  */
        cmpq    $VEC_SIZE, %rdx
        jbe     L(last_vec)
        cmpq    $(VEC_SIZE * 2), %rdx
        jbe     L(last_2x_vec)

L(last_4x_vec):

Of course, we can duplicate these blocks to avoid the size checks.

>
>> +L(first_vec):
>> +     /* A byte or int32 is different within 16 or 32 bytes.  */
>> +     bsfl    %eax, %ecx
>> +# ifdef USE_AS_WMEMCMP
>> +     xorl    %eax, %eax
>> +     movl    (%rdi, %rcx), %edx
>> +     cmpl    (%rsi, %rcx), %edx
>> +L(wmemcmp_return):
>> +     setl    %al
>> +     negl    %eax
>> +     orl     $1, %eax
>> +# else
>> +     movzbl  (%rdi, %rcx), %eax
>> +     movzbl  (%rsi, %rcx), %edx
>> +     sub     %edx, %eax
>> +# endif
>> +     VZEROUPPER
>> +     ret
>> +
>
> Loading bytes depending on the result of bsf is slow; an alternative is to
> find that from the vector tests. I could avoid it using tests like this, but
> I haven't measured the performance or tested it yet.
>
> vmovdqu (%rdi), %ymm3
>
> VPCMPGTQ %ymm2, %ymm3, %ymm4
> VPCMPGTQ %ymm3, %ymm2, %ymm5
> vpmovmskb %ymm4, %eax
> vpmovmskb %ymm5, %edx
> neg %eax
> neg %edx
> lzcnt %eax, %eax
> lzcnt %edx, %edx
> sub %edx, %eax
> ret

Andrew, can you give it a try?

>
>
>> +     .p2align 4
>> +L(less_vec):
>> +# ifdef USE_AS_WMEMCMP
>> +     /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
>> +     cmpb    $4, %dl
>> +     je      L(4)
>> +     jb      L(zero)
>> +# else
>> +     cmpb    $1, %dl
>> +     je      L(1)
>> +     jb      L(zero)
>> +     cmpb    $4, %dl
>> +     jb      L(between_2_3)
>> +     cmpb    $8, %dl
>> +     jb      L(between_4_7)
>> +# endif
>> +     cmpb    $16, %dl
>> +     jae     L(between_16_31)
>
> I am not entirely sure about this, as it depends on whether one calls memcmp
> with fixed sizes in a loop or not. If the size is unpredictable, first test
> whether the loads cross a page boundary and handle that as a special case;
> if not, do a 32-byte comparison and return 0 if the first differing byte is
> beyond the size.

There are 2 loads from 2 different sources.  We need to do 2 address
checks before using a 32-byte vector comparison, so I don't know if it
will be faster.
  
Andrew Senkevich June 17, 2017, 10:44 a.m. UTC | #13
2017-06-16 4:15 GMT+02:00 H.J. Lu <hjl.tools@gmail.com>:
> On Thu, Jun 15, 2017 at 5:34 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>> On Thu, Jun 01, 2017 at 08:45:19AM -0700, H.J. Lu wrote:
>>> Optimize x86-64 memcmp/wmemcmp with AVX2.  It uses vector compare as
>>> much as possible.  It is as fast as SSE4 memcmp for size <= 16 bytes
>>> and up to 2X faster for size > 16 bytes on Haswell and Skylake.  Select
>>> AVX2 memcmp/wmemcmp on AVX2 machines where vzeroupper is preferred and
>>> AVX unaligned load is fast.
>>>
>>> Key features:
>>>
>>> 1. Use overlapping compare to avoid branch.
>>> 2. Use vector compare when size >= 4 bytes for memcmp or size >= 8
>>>    bytes for wmemcmp.
>>> 3. If size is 8 * VEC_SIZE or less, unroll the loop.
>>> 4. Compare 4 * VEC_SIZE at a time with the aligned first memory area.
>>> 5. Use 2 vector compares when size is 2 * VEC_SIZE or less.
>>> 6. Use 4 vector compares when size is 4 * VEC_SIZE or less.
>>> 7. Use 8 vector compares when size is 8 * VEC_SIZE or less.
>>>
>>> Any comments?
>>>
>> I have some comments; it's similar to one of my previous patches.
>>
>>> +     cmpq    $(VEC_SIZE * 2), %rdx
>>> +     ja      L(more_2x_vec)
>>> +
>> This is an unnecessary branch; it's likely that there is a difference in
>> the first 16 bytes regardless of size. Move the size test...
>>> +L(last_2x_vec):
>>> +     /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
>>> +     vmovdqu (%rsi), %ymm2
>>> +     VPCMPEQ (%rdi), %ymm2, %ymm2
>>> +     vpmovmskb %ymm2, %eax
>>> +     subl    $VEC_MASK, %eax
>>> +     jnz     L(first_vec)
>> here.
>>
>
> If we do that, the size check will be redundant from
>
>         /* Less than 4 * VEC.  */
>         cmpq    $VEC_SIZE, %rdx
>         jbe     L(last_vec)
>         cmpq    $(VEC_SIZE * 2), %rdx
>         jbe     L(last_2x_vec)
>
> L(last_4x_vec):
>
> Of course, we can duplicate these blocks to avoid the size checks.
>
>>
>>> +L(first_vec):
>>> +     /* A byte or int32 is different within 16 or 32 bytes.  */
>>> +     bsfl    %eax, %ecx
>>> +# ifdef USE_AS_WMEMCMP
>>> +     xorl    %eax, %eax
>>> +     movl    (%rdi, %rcx), %edx
>>> +     cmpl    (%rsi, %rcx), %edx
>>> +L(wmemcmp_return):
>>> +     setl    %al
>>> +     negl    %eax
>>> +     orl     $1, %eax
>>> +# else
>>> +     movzbl  (%rdi, %rcx), %eax
>>> +     movzbl  (%rsi, %rcx), %edx
>>> +     sub     %edx, %eax
>>> +# endif
>>> +     VZEROUPPER
>>> +     ret
>>> +
>>
>> Loading bytes depending on the result of bsf is slow; an alternative is to
>> find that from the vector tests. I could avoid it using tests like this, but
>> I haven't measured the performance or tested it yet.
>>
>> vmovdqu (%rdi), %ymm3
>>
>> VPCMPGTQ %ymm2, %ymm3, %ymm4
>> VPCMPGTQ %ymm3, %ymm2, %ymm5
>> vpmovmskb %ymm4, %eax
>> vpmovmskb %ymm5, %edx
>> neg %eax
>> neg %edx
>> lzcnt %eax, %eax
>> lzcnt %edx, %edx
>> sub %edx, %eax
>> ret
>
> Andrew, can you give it a try?

Hi Ondrej, could you send patch with you proposal?
I have tried the following change and got many wrong test-memcmp results:

<       leaq    -VEC_SIZE(%rdi, %rdx), %rdi
<       leaq    -VEC_SIZE(%rsi, %rdx), %rsi
<       vmovdqu (%rsi), %ymm2
<       VPCMPEQ (%rdi), %ymm2, %ymm2
---
>       leaq    -VEC_SIZE(%rdi, %rdx), %r8
>       leaq    -VEC_SIZE(%rsi, %rdx), %r9
>       vmovdqu (%r9), %ymm2
>       VPCMPEQ (%r8), %ymm2, %ymm2
91,104c91,103
<       tzcntl  %eax, %ecx
< # ifdef USE_AS_WMEMCMP
<       xorl    %eax, %eax
<       movl    (%rdi, %rcx), %edx
<       cmpl    (%rsi, %rcx), %edx
< L(wmemcmp_return):
<       setl    %al
<       negl    %eax
<       orl     $1, %eax
< # else
<       movzbl  (%rdi, %rcx), %eax
<       movzbl  (%rsi, %rcx), %edx
<       sub     %edx, %eax
< # endif
---
>       vmovdqu (%rsi), %ymm2
>       vmovdqu (%rdi), %ymm3
>
>       VPCMPGTQ %ymm2, %ymm3, %ymm4
>       VPCMPGTQ %ymm3, %ymm2, %ymm5
>       vpmovmskb %ymm4, %eax
>       vpmovmskb %ymm5, %edx
>       neg %eax
>       neg %edx
>       lzcnt %eax, %eax
>       lzcnt %edx, %edx
>       sub %edx, %eax
>

Thanks.


--
WBR,
Andrew
  
H.J. Lu June 20, 2017, 6:16 p.m. UTC | #14
On Sat, Jun 17, 2017 at 3:44 AM, Andrew Senkevich
<andrew.n.senkevich@gmail.com> wrote:
> 2017-06-16 4:15 GMT+02:00 H.J. Lu <hjl.tools@gmail.com>:
>> On Thu, Jun 15, 2017 at 5:34 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>>> On Thu, Jun 01, 2017 at 08:45:19AM -0700, H.J. Lu wrote:
>>>> Optimize x86-64 memcmp/wmemcmp with AVX2.  It uses vector compare as
>>>> much as possible.  It is as fast as SSE4 memcmp for size <= 16 bytes
>>>> and up to 2X faster for size > 16 bytes on Haswell and Skylake.  Select
>>>> AVX2 memcmp/wmemcmp on AVX2 machines where vzeroupper is preferred and
>>>> AVX unaligned load is fast.
>>>>
>>>> Key features:
>>>>
>>>> 1. Use overlapping compare to avoid branch.
>>>> 2. Use vector compare when size >= 4 bytes for memcmp or size >= 8
>>>>    bytes for wmemcmp.
>>>> 3. If size is 8 * VEC_SIZE or less, unroll the loop.
>>>> 4. Compare 4 * VEC_SIZE at a time with the aligned first memory area.
>>>> 5. Use 2 vector compares when size is 2 * VEC_SIZE or less.
>>>> 6. Use 4 vector compares when size is 4 * VEC_SIZE or less.
>>>> 7. Use 8 vector compares when size is 8 * VEC_SIZE or less.
>>>>
>>>> Any comments?
>>>>
>>> I have some comments; it's similar to one of my previous patches.
>>>
>>>> +     cmpq    $(VEC_SIZE * 2), %rdx
>>>> +     ja      L(more_2x_vec)
>>>> +
>>> This is an unnecessary branch; it's likely that there is a difference in
>>> the first 16 bytes regardless of size. Move the size test...
>>>> +L(last_2x_vec):
>>>> +     /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
>>>> +     vmovdqu (%rsi), %ymm2
>>>> +     VPCMPEQ (%rdi), %ymm2, %ymm2
>>>> +     vpmovmskb %ymm2, %eax
>>>> +     subl    $VEC_MASK, %eax
>>>> +     jnz     L(first_vec)
>>> here.
>>>
>>
>> If we do that, the size check will be redundant from
>>
>>         /* Less than 4 * VEC.  */
>>         cmpq    $VEC_SIZE, %rdx
>>         jbe     L(last_vec)
>>         cmpq    $(VEC_SIZE * 2), %rdx
>>         jbe     L(last_2x_vec)
>>
>> L(last_4x_vec):
>>
>> Of course, we can duplicate these blocks to avoid the size checks.
>>
>>>
>>>> +L(first_vec):
>>>> +     /* A byte or int32 is different within 16 or 32 bytes.  */
>>>> +     bsfl    %eax, %ecx
>>>> +# ifdef USE_AS_WMEMCMP
>>>> +     xorl    %eax, %eax
>>>> +     movl    (%rdi, %rcx), %edx
>>>> +     cmpl    (%rsi, %rcx), %edx
>>>> +L(wmemcmp_return):
>>>> +     setl    %al
>>>> +     negl    %eax
>>>> +     orl     $1, %eax
>>>> +# else
>>>> +     movzbl  (%rdi, %rcx), %eax
>>>> +     movzbl  (%rsi, %rcx), %edx
>>>> +     sub     %edx, %eax
>>>> +# endif
>>>> +     VZEROUPPER
>>>> +     ret
>>>> +
>>>
>>> Loading bytes depending on the result of bsf is slow; an alternative is to
>>> find that from the vector tests. I could avoid it using tests like this, but
>>> I haven't measured the performance or tested it yet.
>>>
>>> vmovdqu (%rdi), %ymm3
>>>
>>> VPCMPGTQ %ymm2, %ymm3, %ymm4
>>> VPCMPGTQ %ymm3, %ymm2, %ymm5
>>> vpmovmskb %ymm4, %eax
>>> vpmovmskb %ymm5, %edx
>>> neg %eax
>>> neg %edx
>>> lzcnt %eax, %eax
>>> lzcnt %edx, %edx
>>> sub %edx, %eax
>>> ret
>>
>> Andrew, can you give it a try?
>
> Hi Ondrej, could you send patch with you proposal?
> I have tried the following change and got many wrong test-memcmp results:

We can't use VPCMPGT for memcmp since it performs signed
comparison, but memcmp requires unsigned comparison.
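
(A small C illustration of the signedness issue, an example rather than
anything from the thread: a 0x80 byte must compare greater than 0x01 for
memcmp, but a signed element comparison orders it first.)

  #include <stdio.h>

  int main (void)
  {
    unsigned char x = 0x80, y = 0x01;

    /* memcmp order: 0x80 > 0x01.  */
    int unsigned_gt = (x > y);
    /* What a signed element compare (VPCMPGT) sees: -128 > 1 is false.  */
    int signed_gt = ((signed char) x > (signed char) y);

    printf ("unsigned: %d  signed: %d\n", unsigned_gt, signed_gt);
    return 0;
  }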

H.J.
  

Patch

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 3736f54..a62def3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -6,6 +6,7 @@  ifeq ($(subdir),string)
 
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcmp-sse2-unaligned strncmp-ssse3 \
+		   memcmp-avx2 \
 		   memcmp-sse4 memcpy-ssse3 \
 		   memmove-ssse3 \
 		   memcpy-ssse3-back \
@@ -30,5 +31,7 @@  CFLAGS-strspn-c.c += -msse4
 endif
 
 ifeq ($(subdir),wcsmbs)
-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
+sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+		   wmemcmp-avx2 \
+		   wcscpy-ssse3 wcscpy-c
 endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a91d2f9..35f1960 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -40,6 +40,9 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/memcmp.S.  */
   IFUNC_IMPL (i, name, memcmp,
+	      IFUNC_IMPL_ADD (array, i, memcmp,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memcmp_avx2)
 	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_1),
 			      __memcmp_sse4_1)
 	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3),
@@ -294,6 +297,9 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/wmemcmp.S.  */
   IFUNC_IMPL (i, name, wmemcmp,
+	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __wmemcmp_avx2)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_1),
 			      __wmemcmp_sse4_1)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2.S b/sysdeps/x86_64/multiarch/memcmp-avx2.S
new file mode 100644
index 0000000..8e3872a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2.S
@@ -0,0 +1,430 @@ 
+/* memcmp/wmemcmp optimized with AVX2.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+/* memcmp/wmemcmp is implemented as:
+   1. Use overlapping compare to avoid branch.
+   2. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+      bytes for wmemcmp.
+   3. If size is 8 * VEC_SIZE or less, unroll the loop.
+   4. Compare 4 * VEC_SIZE at a time with the aligned first memory
+      area.
+   5. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+   6. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+   7. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_avx2
+# endif
+
+# ifdef USE_AS_WMEMCMP
+#  define VPCMPEQ	vpcmpeqd
+# else
+#  define VPCMPEQ	vpcmpeqb
+# endif
+
+# ifndef VZEROUPPER
+#  define VZEROUPPER	vzeroupper
+# endif
+
+# define VEC_SIZE 32
+# define VEC_MASK ((1 << VEC_SIZE) - 1)
+
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elements.
+*/
+
+	.section .text.avx,"ax",@progbits
+ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %rdx
+# endif
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+
+L(last_2x_vec):
+	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+L(last_vec):
+	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
+	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec):
+	/* A byte or int32 is different within 16 or 32 bytes.  */
+	bsfl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(%rdi, %rcx), %edx
+	cmpl	(%rsi, %rcx), %edx
+L(wmemcmp_return):
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
+# else
+	movzbl	(%rdi, %rcx), %eax
+	movzbl	(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	VZEROUPPER
+	ret
+
+# ifdef USE_AS_WMEMCMP
+	.p2align 4
+L(4):
+	xorl	%eax, %eax
+	movl	(%rdi), %edx
+	cmpl	(%rsi), %edx
+	jne	L(wmemcmp_return)
+	ret
+# else
+	.p2align 4
+L(between_4_7):
+	vmovd	(%rdi), %xmm1
+	vmovd	(%rsi), %xmm2
+	VPCMPEQ %xmm1, %xmm2, %xmm2
+	vpmovmskb %xmm2, %eax
+	subl    $0xffff, %eax
+	jnz	L(first_vec)
+	leaq	-4(%rdi, %rdx), %rdi
+	leaq	-4(%rsi, %rdx), %rsi
+	vmovd	(%rdi), %xmm1
+	vmovd	(%rsi), %xmm2
+	VPCMPEQ %xmm1, %xmm2, %xmm2
+	vpmovmskb %xmm2, %eax
+	subl    $0xffff, %eax
+	jnz	L(first_vec)
+	ret
+
+	.p2align 4
+L(between_2_3):
+	/* Load 2 bytes into registers.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	/* Compare the lowest byte.  */
+	cmpb	%cl, %al
+	jne	L(1byte_reg)
+	/* Load the difference of 2 bytes into EAX.  */
+	subl	%ecx, %eax
+	/* Return if 2 bytes differ.  */
+	jnz	L(exit)
+	cmpb	$2, %dl
+	/* Return if these are the last 2 bytes.  */
+	je	L(exit)
+	movzbl	2(%rdi), %eax
+	movzbl	2(%rsi), %ecx
+	subl	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit):
+	ret
+
+	.p2align 4
+L(1byte_reg):
+	movzbl	%al, %eax
+	movzbl	%cl, %ecx
+	sub	%ecx, %eax
+	ret
+
+	.p2align 4
+L(1):
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	subl	%ecx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(zero):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4
+L(less_vec):
+# ifdef USE_AS_WMEMCMP
+	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
+	cmpb	$4, %dl
+	je	L(4)
+	jb	L(zero)
+# else
+	cmpb	$1, %dl
+	je	L(1)
+	jb	L(zero)
+	cmpb	$4, %dl
+	jb	L(between_2_3)
+	cmpb	$8, %dl
+	jb	L(between_4_7)
+# endif
+	cmpb	$16, %dl
+	jae	L(between_16_31)
+	/* It is between 8 and 15 bytes.  */
+	vmovq	(%rdi), %xmm1
+	vmovq	(%rsi), %xmm2
+	VPCMPEQ %xmm1, %xmm2, %xmm2
+	vpmovmskb %xmm2, %eax
+	subl    $0xffff, %eax
+	jnz	L(first_vec)
+	leaq	-8(%rdi, %rdx), %rdi
+	leaq	-8(%rsi, %rdx), %rsi
+	vmovq	(%rdi), %xmm1
+	vmovq	(%rsi), %xmm2
+	VPCMPEQ %xmm1, %xmm2, %xmm2
+	vpmovmskb %xmm2, %eax
+	subl    $0xffff, %eax
+	jnz	L(first_vec)
+	ret
+
+	.p2align 4
+L(between_16_31):
+	/* From 16 to 31 bytes.  No branch when size == 16.  */
+	vmovdqu	(%rsi), %xmm2
+	VPCMPEQ (%rdi), %xmm2, %xmm2
+	vpmovmskb %xmm2, %eax
+	subl    $0xffff, %eax
+	jnz	L(first_vec)
+
+	leaq	-16(%rdi, %rdx), %rdi
+	leaq	-16(%rsi, %rdx), %rsi
+	vmovdqu	(%rsi), %xmm2
+	VPCMPEQ (%rdi), %xmm2, %xmm2
+	vpmovmskb %xmm2, %eax
+	subl    $0xffff, %eax
+	jnz	L(first_vec)
+	ret
+
+	.p2align 4
+L(more_2x_vec):
+	/* More than 2 * VEC.  */
+	cmpq	$(VEC_SIZE * 8), %rdx
+	ja	L(more_8x_vec)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jb	L(last_4x_vec)
+
+	/* From 4 * VEC to 8 * VEC, inclusively. */
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ (%rdi), %ymm1, %ymm1
+
+	vmovdqu	VEC_SIZE(%rsi), %ymm2
+	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+
+	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+
+	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+
+	vpand	%ymm1, %ymm2, %ymm5
+	vpand	%ymm3, %ymm4, %ymm6
+	vpand	%ymm5, %ymm6, %ymm5
+
+	vpmovmskb %ymm5, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(4x_vec_end)
+
+	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ (%rdi), %ymm1, %ymm1
+
+	vmovdqu	VEC_SIZE(%rsi), %ymm2
+	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+	vpand	%ymm2, %ymm1, %ymm5
+
+	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+	vpand	%ymm3, %ymm5, %ymm5
+
+	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+	vpand	%ymm4, %ymm5, %ymm5
+
+	vpmovmskb %ymm5, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(4x_vec_end)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(more_8x_vec):
+	/* More than 8 * VEC.  Check the first VEC.  */
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	/* Align the first memory area for aligned loads in the loop.
+	   Compute how much the first memory area is misaligned.  */
+	movq	%rdi, %rcx
+	andl	$(VEC_SIZE - 1), %ecx
+	/* Get the negative of offset for alignment.  */
+	subq	$VEC_SIZE, %rcx
+	/* Adjust the second memory area.  */
+	subq	%rcx, %rsi
+	/* Adjust the first memory area which should be aligned now.  */
+	subq	%rcx, %rdi
+	/* Adjust length.  */
+	addq	%rcx, %rdx
+
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ (%rdi), %ymm1, %ymm1
+
+	vmovdqu	VEC_SIZE(%rsi), %ymm2
+	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+	vpand	%ymm2, %ymm1, %ymm5
+
+	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+	vpand	%ymm3, %ymm5, %ymm5
+
+	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+	vpand	%ymm4, %ymm5, %ymm5
+
+	vpmovmskb %ymm5, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(4x_vec_end)
+
+	addq	$(VEC_SIZE * 4), %rdi
+	addq	$(VEC_SIZE * 4), %rsi
+
+	subq	$(VEC_SIZE * 4), %rdx
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jae	L(loop_4x_vec)
+
+	/* Less than 4 * VEC.  */
+	cmpq	$VEC_SIZE, %rdx
+	jbe	L(last_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jbe	L(last_2x_vec)
+
+L(last_4x_vec):
+	/* From 2 * VEC to 4 * VEC. */
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	addq	$VEC_SIZE, %rdi
+	addq	$VEC_SIZE, %rsi
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	addq	$VEC_SIZE, %rdi
+	addq	$VEC_SIZE, %rsi
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(4x_vec_end):
+	vpmovmskb %ymm1, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+	vpmovmskb %ymm2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec_x1)
+	vpmovmskb %ymm3, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec_x2)
+	vpmovmskb %ymm4, %eax
+	subl	$VEC_MASK, %eax
+	bsfl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x1):
+	bsfl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	VEC_SIZE(%rdi, %rcx), %edx
+	cmpl	VEC_SIZE(%rsi, %rcx), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+	movzbl	VEC_SIZE(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	bsfl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	VZEROUPPER
+	ret
+END (MEMCMP)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
index 6129820..08acacb 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/x86_64/multiarch/memcmp.S
@@ -27,7 +27,16 @@ 
 ENTRY(memcmp)
 	.type	memcmp, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
-	HAS_CPU_FEATURE (SSSE3)
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__memcmp_avx2(%rip), %rax
+	ret
+
+1:	HAS_CPU_FEATURE (SSSE3)
 	jnz	2f
 	leaq	__memcmp_sse2(%rip), %rax
 	ret
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2.S
new file mode 100644
index 0000000..aa2190b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2.S
@@ -0,0 +1,4 @@ 
+#define MEMCMP __wmemcmp_avx2
+#define USE_AS_WMEMCMP 1
+
+#include "memcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S
index 5dc54d7..46ee8f5 100644
--- a/sysdeps/x86_64/multiarch/wmemcmp.S
+++ b/sysdeps/x86_64/multiarch/wmemcmp.S
@@ -27,7 +27,16 @@ 
 ENTRY(wmemcmp)
 	.type	wmemcmp, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
-	HAS_CPU_FEATURE (SSSE3)
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__wmemcmp_avx2(%rip), %rax
+	ret
+
+1:	HAS_CPU_FEATURE (SSSE3)
 	jnz	2f
 	leaq	__wmemcmp_sse2(%rip), %rax
 	ret