x86_64: memcpy/memmove family optimized with AVX512

Message ID CAMXFM3uGLiFE+pKPzFgWP6Sx4C3w2Ktd4w3+35O0Bj=B1s0naA@mail.gmail.com
State New, archived

Commit Message

Andrew Senkevich Jan. 12, 2016, 2:13 p.m. UTC
  Hi,

here are AVX512 implementations of memcpy, mempcpy, memmove,
memcpy_chk, mempcpy_chk and memmove_chk.
They show an average improvement of more than 30% over the AVX versions on KNL
hardware; performance results are attached.
Ok for trunk?

2016-01-12  Andrew Senkevich  <andrew.senkevich@intel.com>

        * sysdeps/x86_64/multiarch/Makefile: Added new files.
        * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Added new tests.
        * sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S: New file.
        * sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S: Likewise.
        * sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S: Likewise.
        * sysdeps/x86_64/multiarch/memcpy.S: Added new IFUNC branch.
        * sysdeps/x86_64/multiarch/memcpy_chk.S: Likewise.
        * sysdeps/x86_64/multiarch/memmove.c: Likewise.
        * sysdeps/x86_64/multiarch/memmove_chk.c: Likewise.
        * sysdeps/x86_64/multiarch/mempcpy.S: Likewise.
        * sysdeps/x86_64/multiarch/mempcpy_chk.S: Likewise.



--
WBR,
Andrew
  

Comments

Adhemerval Zanella Netto Jan. 12, 2016, 6:37 p.m. UTC | #1
On 12-01-2016 12:13, Andrew Senkevich wrote:
> Hi,
> 
> here is AVX512 implementations of memcpy, mempcpy, memmove,
> memcpy_chk, mempcpy_chk, memmove_chk.
> It shows average improvement more than 30% over AVX versions on KNL
> hardware, performance results attached.
> Ok for trunk?

It is too late for 2.23, but ok after review for 2.24.
  
H.J. Lu Jan. 13, 2016, 2:21 p.m. UTC | #2
On Tue, Jan 12, 2016 at 6:13 AM, Andrew Senkevich
<andrew.n.senkevich@gmail.com> wrote:
> Hi,
>
> here is AVX512 implementations of memcpy, mempcpy, memmove,
> memcpy_chk, mempcpy_chk, memmove_chk.
> It shows average improvement more than 30% over AVX versions on KNL
> hardware, performance results attached.
> Ok for trunk?
>
> 2016-01-12  Andrew Senkevich  <andrew.senkevich@intel.com>
>
>         * sysdeps/x86_64/multiarch/Makefile: Added new files.
>         * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Added new tests.
>         * sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S: New file.
>         * sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S: Likewise.
>         * sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S: Likewise.
>         * sysdeps/x86_64/multiarch/memcpy.S: Added new IFUNC branch.
>         * sysdeps/x86_64/multiarch/memcpy_chk.S: Likewise.
>         * sysdeps/x86_64/multiarch/memmove.c: Likewise.
>         * sysdeps/x86_64/multiarch/memmove_chk.c: Likewise.
>         * sysdeps/x86_64/multiarch/mempcpy.S: Likewise.
>         * sysdeps/x86_64/multiarch/mempcpy_chk.S: Likewise.
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile
> b/sysdeps/x86_64/multiarch/Makefile
> index b2e31ef..d234f4a 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -7,11 +7,12 @@ ifeq ($(subdir),string)
>
>  sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
>     strcmp-sse2-unaligned strncmp-ssse3 \
> -   memcmp-sse4 memcpy-ssse3 \
> -   memcpy-sse2-unaligned mempcpy-ssse3 \
> -   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
> -   memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
> -   memmove-ssse3-back strcasecmp_l-ssse3 \
> +   memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned \
> +   memcpy-avx512-no-vzeroupper mempcpy-ssse3 memmove-ssse3 \
> +   memcpy-ssse3-back mempcpy-ssse3-back memmove-avx-unaligned \
> +   memcpy-avx-unaligned mempcpy-avx-unaligned \
> +   mempcpy-avx512-no-vzeroupper memmove-ssse3-back \
> +   memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
>     strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
>     strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
>     strcpy-sse2-unaligned strncpy-sse2-unaligned \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 5f600dc..7746d79 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -24,7 +24,7 @@
>  #include "init-arch.h"
>
>  /* Maximum number of IFUNC implementations.  */
> -#define MAX_IFUNC 4
> +#define MAX_IFUNC 5
>
>  /* Fill ARRAY of MAX elements with IFUNC implementations for function
>     NAME supported on target machine and return the number of valid
> @@ -46,9 +46,12 @@ __libc_ifunc_impl_list (const char *name, struct
> libc_ifunc_impl *array,
>        __memcmp_ssse3)
>        IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
>
> -  /* Support sysdeps/x86_64/multiarch/memmove_chk.S.  */
> +  /* Support sysdeps/x86_64/multiarch/memmove_chk.c.  */
>    IFUNC_IMPL (i, name, __memmove_chk,
>        IFUNC_IMPL_ADD (array, i, __memmove_chk,
> +      HAS_ARCH_FEATURE (AVX512F_Usable),
> +      __memmove_chk_avx512_no_vzeroupper)
> +      IFUNC_IMPL_ADD (array, i, __memmove_chk,
>        HAS_ARCH_FEATURE (AVX_Usable),
>        __memmove_chk_avx_unaligned)
>        IFUNC_IMPL_ADD (array, i, __memmove_chk,
> @@ -65,6 +68,9 @@ __libc_ifunc_impl_list (const char *name, struct
> libc_ifunc_impl *array,
>        IFUNC_IMPL_ADD (array, i, memmove,
>        HAS_ARCH_FEATURE (AVX_Usable),
>        __memmove_avx_unaligned)
> +      IFUNC_IMPL_ADD (array, i, memmove,
> +      HAS_ARCH_FEATURE (AVX512F_Usable),
> +      __memmove_avx512_no_vzeroupper)
>        IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
>        __memmove_ssse3_back)
>        IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
> @@ -274,6 +280,9 @@ __libc_ifunc_impl_list (const char *name, struct
> libc_ifunc_impl *array,
>        __memcpy_ssse3_back)
>        IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
>        __memcpy_ssse3)
> +      IFUNC_IMPL_ADD (array, i, memcpy,
> +      HAS_ARCH_FEATURE (AVX512F_Usable),
> +      __memcpy_avx512_no_vzeroupper)
>        IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
>        IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
>
> @@ -294,6 +303,9 @@ __libc_ifunc_impl_list (const char *name, struct
> libc_ifunc_impl *array,
>    /* Support sysdeps/x86_64/multiarch/mempcpy.S.  */
>    IFUNC_IMPL (i, name, mempcpy,
>        IFUNC_IMPL_ADD (array, i, mempcpy,
> +      HAS_ARCH_FEATURE (AVX512F_Usable),
> +      __mempcpy_avx512_no_vzeroupper)
> +      IFUNC_IMPL_ADD (array, i, mempcpy,
>        HAS_ARCH_FEATURE (AVX_Usable),
>        __mempcpy_avx_unaligned)
>        IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),

Please add _chk tests.
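
For context, the hunks above add the AVX512 variant of __memmove_chk to
ifunc-impl-list.c but not the memcpy/mempcpy _chk variants, even though the
new dispatcher branches in memcpy_chk.S and mempcpy_chk.S reference them.
A rough sketch of the kind of entries being asked for, modeled on the
__memmove_chk hunk above (the placement and the surrounding non-AVX512
entries are assumptions, not part of the posted patch):

  /* Support sysdeps/x86_64/multiarch/memcpy_chk.S.  */
  IFUNC_IMPL (i, name, __memcpy_chk,
	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
			      HAS_ARCH_FEATURE (AVX512F_Usable),
			      __memcpy_chk_avx512_no_vzeroupper)
	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
			      HAS_ARCH_FEATURE (AVX_Usable),
			      __memcpy_chk_avx_unaligned)
	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_CPU_FEATURE (SSSE3),
			      __memcpy_chk_ssse3_back)
	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_CPU_FEATURE (SSSE3),
			      __memcpy_chk_ssse3)
	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, __memcpy_chk_sse2))

An analogous __mempcpy_chk_avx512_no_vzeroupper entry would go into the
__mempcpy_chk block.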


> diff --git a/sysdeps/x86_64/multiarch/memcpy.S
> b/sysdeps/x86_64/multiarch/memcpy.S
> index 27fca29..64a1bcd 100644
> --- a/sysdeps/x86_64/multiarch/memcpy.S
> +++ b/sysdeps/x86_64/multiarch/memcpy.S
> @@ -30,19 +30,27 @@
>  ENTRY(__new_memcpy)
>   .type __new_memcpy, @gnu_indirect_function
>   LOAD_RTLD_GLOBAL_RO_RDX
> - leaq __memcpy_avx_unaligned(%rip), %rax
> +#ifdef HAVE_AVX512_ASM_SUPPORT
> + HAS_ARCH_FEATURE (AVX512F_Usable)
> + jz 1f
> + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
> + jz 1f
> + leaq    __memcpy_avx512_no_vzeroupper(%rip), %rax
> + ret
> +#endif
> +1: leaq __memcpy_avx_unaligned(%rip), %rax
>   HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
> - jz 1f
> + jz 2f
>   ret
> -1: leaq __memcpy_sse2(%rip), %rax
> +2: leaq __memcpy_sse2(%rip), %rax
>   HAS_ARCH_FEATURE (Slow_BSF)
> - jnz 2f
> + jnz 3f
>   leaq __memcpy_sse2_unaligned(%rip), %rax
>   ret
> -2: HAS_CPU_FEATURE (SSSE3)
> - jz 3f
> +3: HAS_CPU_FEATURE (SSSE3)
> + jz 4f
>   leaq    __memcpy_ssse3(%rip), %rax
> -3: ret
> +4: ret
>  END(__new_memcpy)
>
>  # undef ENTRY

Please find a way not to re-order labels when adding a new
implementation next time.

Thanks.
  
Andrew Senkevich Jan. 13, 2016, 6:10 p.m. UTC | #3
> On 12-01-2016 12:13, Andrew Senkevich wrote:
>> Hi,
>>
>> here is AVX512 implementations of memcpy, mempcpy, memmove,
>> memcpy_chk, mempcpy_chk, memmove_chk.
>> It shows average improvement more than 30% over AVX versions on KNL
>> hardware, performance results attached.
>> Ok for trunk?
>
> It is too late for 2.23, but ok after review for 2.24.


We would like this patch to be considered for glibc 2.23, since it completes the AVX-512 improvements to the mem* routines.
Memset tuned for AVX-512 is already checked in, so it seems reasonable to have full support in 2.23.
The changes are also strictly AVX-512 specific and add no new interfaces, so the potential risk of the patch is quite low.

We have already received review comments with no major questions about the patch, and a fixed version will be ready today.

Given all this, can the patch go to current glibc trunk once the review is finished?


--
Andrew
  
Carlos O'Donell Jan. 13, 2016, 6:20 p.m. UTC | #4
On 01/12/2016 09:13 AM, Andrew Senkevich wrote:
> Hi,
> 
> here is AVX512 implementations of memcpy, mempcpy, memmove,
> memcpy_chk, mempcpy_chk, memmove_chk.
> It shows average improvement more than 30% over AVX versions on KNL
> hardware, performance results attached.
> Ok for trunk?
> 
> 2016-01-12  Andrew Senkevich  <andrew.senkevich@intel.com>
> 
>         * sysdeps/x86_64/multiarch/Makefile: Added new files.
>         * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Added new tests.
>         * sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S: New file.
>         * sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S: Likewise.
>         * sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S: Likewise.
>         * sysdeps/x86_64/multiarch/memcpy.S: Added new IFUNC branch.
>         * sysdeps/x86_64/multiarch/memcpy_chk.S: Likewise.
>         * sysdeps/x86_64/multiarch/memmove.c: Likewise.
>         * sysdeps/x86_64/multiarch/memmove_chk.c: Likewise.
>         * sysdeps/x86_64/multiarch/mempcpy.S: Likewise.
>         * sysdeps/x86_64/multiarch/mempcpy_chk.S: Likewise.

Looks good to me.

Thanks for the results. Yes, it looks like a consistent ~30% gain over AVX.
My only thoughts are: how does this scale as you add more threads to
the process that try to use those functional units? Have you done any
scalability testing on these implementations?

Cheers,
Carlos.
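
To make the question concrete, a minimal sketch of the kind of scalability
test meant here (not something from this thread): each thread runs the same
memcpy workload and the aggregate throughput is compared as the thread count
grows.  Buffer size, iteration count and thread counts are arbitrary
placeholders.

/* Hypothetical multi-thread memcpy throughput harness.  Build with
   gcc -O2 -pthread bench-memcpy-threads.c (illustrative file name).  */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define BUF_SIZE (1 << 20)	/* 1 MiB per thread.  */
#define ITERS 10000

static void *
worker (void *arg)
{
  char *src = malloc (BUF_SIZE), *dst = malloc (BUF_SIZE);
  memset (src, 0x5a, BUF_SIZE);
  for (int i = 0; i < ITERS; i++)
    memcpy (dst, src, BUF_SIZE);	/* Resolved through the ifunc.  */
  free (src);
  free (dst);
  return arg;
}

int
main (void)
{
  for (int nthreads = 1; nthreads <= 8; nthreads *= 2)
    {
      pthread_t tid[8];
      struct timespec t0, t1;
      clock_gettime (CLOCK_MONOTONIC, &t0);
      for (int i = 0; i < nthreads; i++)
	pthread_create (&tid[i], NULL, worker, NULL);
      for (int i = 0; i < nthreads; i++)
	pthread_join (tid[i], NULL);
      clock_gettime (CLOCK_MONOTONIC, &t1);
      double sec = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
      double gib = (double) nthreads * ITERS * BUF_SIZE / (1 << 30);
      printf ("%d thread(s): %.2f GiB/s aggregate\n", nthreads, gib / sec);
    }
  return 0;
}

Flat aggregate throughput as threads are added would point at the kind of
shared-resource contention the question is about.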
  
Carlos O'Donell Jan. 13, 2016, 6:22 p.m. UTC | #5
On 01/13/2016 01:10 PM, Senkevich, Andrew wrote:
>> On 12-01-2016 12:13, Andrew Senkevich wrote:
>>> Hi,
>>> 
>>> here is AVX512 implementations of memcpy, mempcpy, memmove, 
>>> memcpy_chk, mempcpy_chk, memmove_chk. It shows average
>>> improvement more than 30% over AVX versions on KNL hardware,
>>> performance results attached. Ok for trunk?
>> 
>> It is too late for 2.23, but ok after review for 2.24.
> 
> We would like this patch to be considered for glibc 2.23 since the
> functionality completes AVX-512 improvements of mem* routines. Memset
> tuned for AVX-512 is already checked in so it looks reasonable to
> have full support in 2.23. Also the changes are strongly AVX-512
> specific, not adding any new interfaces so potential risk of the
> patch is pretty low.
> 
> We already got review comments without any major questions to the
> patch and fixed version will be ready today.
> 
> Given all this can the patch go to current glibc trunk after review
> is finished?

I've completed my review and the patches look good. I had one question
which should not block acceptance for 2.23 (I wanted to know about the
scalability of AVX512 across threads in a process and how the routines
perform versus the others from a scalability perspective, e.g. some
functional groups might not scale well due to interlocks and other
reasons).

Given that this work is half complete I think we should accept Intel's
patches and complete the work for 2.23.

A NEWS note is required to highlight the new support for AVX512.

Cheers,
Carlos.
  
Adhemerval Zanella Netto Jan. 13, 2016, 6:34 p.m. UTC | #6
On 13-01-2016 16:10, Senkevich, Andrew wrote:
>> On 12-01-2016 12:13, Andrew Senkevich wrote:
>>> Hi,
>>>
>>> here is AVX512 implementations of memcpy, mempcpy, memmove, 
>>> memcpy_chk, mempcpy_chk, memmove_chk.
>>> It shows average improvement more than 30% over AVX versions on KNL 
>>> hardware, performance results attached.
>>> Ok for trunk?
>>
>> It is too late for 2.23, but ok after review for 2.24.
> 
> We would like this patch to be considered for glibc 2.23 since the functionality completes AVX-512 improvements of mem* routines.
> Memset tuned for AVX-512 is already checked in so it looks reasonable to have full support in 2.23.
> Also the changes are strongly AVX-512 specific, not adding any new interfaces so potential risk of the patch is pretty low.
> 
> We already got review comments without any major questions to the patch and fixed version will be ready today.
> 
> Given all this can the patch go to current glibc trunk after review is finished?

Right, as I talked with Carlos in IRC I will let x86 maintainers have the final
word on this inclusion.
  
Adhemerval Zanella Netto Jan. 13, 2016, 6:36 p.m. UTC | #7
On 13-01-2016 12:21, H.J. Lu wrote:
> On Tue, Jan 12, 2016 at 6:13 AM, Andrew Senkevich
> <andrew.n.senkevich@gmail.com> wrote:
>> Hi,
>>
>> here is AVX512 implementations of memcpy, mempcpy, memmove,
>> memcpy_chk, mempcpy_chk, memmove_chk.
>> It shows average improvement more than 30% over AVX versions on KNL
>> hardware, performance results attached.
>> Ok for trunk?
>>
>> 2016-01-12  Andrew Senkevich  <andrew.senkevich@intel.com>
>>
>>         * sysdeps/x86_64/multiarch/Makefile: Added new files.
>>         * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Added new tests.
>>         * sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S: New file.
>>         * sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S: Likewise.
>>         * sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S: Likewise.
>>         * sysdeps/x86_64/multiarch/memcpy.S: Added new IFUNC branch.
>>         * sysdeps/x86_64/multiarch/memcpy_chk.S: Likewise.
>>         * sysdeps/x86_64/multiarch/memmove.c: Likewise.
>>         * sysdeps/x86_64/multiarch/memmove_chk.c: Likewise.
>>         * sysdeps/x86_64/multiarch/mempcpy.S: Likewise.
>>         * sysdeps/x86_64/multiarch/mempcpy_chk.S: Likewise.
>>
>> diff --git a/sysdeps/x86_64/multiarch/Makefile
>> b/sysdeps/x86_64/multiarch/Makefile
>> index b2e31ef..d234f4a 100644
>> --- a/sysdeps/x86_64/multiarch/Makefile
>> +++ b/sysdeps/x86_64/multiarch/Makefile
>> @@ -7,11 +7,12 @@ ifeq ($(subdir),string)
>>
>>  sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
>>     strcmp-sse2-unaligned strncmp-ssse3 \
>> -   memcmp-sse4 memcpy-ssse3 \
>> -   memcpy-sse2-unaligned mempcpy-ssse3 \
>> -   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
>> -   memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
>> -   memmove-ssse3-back strcasecmp_l-ssse3 \
>> +   memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned \
>> +   memcpy-avx512-no-vzeroupper mempcpy-ssse3 memmove-ssse3 \
>> +   memcpy-ssse3-back mempcpy-ssse3-back memmove-avx-unaligned \
>> +   memcpy-avx-unaligned mempcpy-avx-unaligned \
>> +   mempcpy-avx512-no-vzeroupper memmove-ssse3-back \
>> +   memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
>>     strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
>>     strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
>>     strcpy-sse2-unaligned strncpy-sse2-unaligned \
>> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> index 5f600dc..7746d79 100644
>> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> @@ -24,7 +24,7 @@
>>  #include "init-arch.h"
>>
>>  /* Maximum number of IFUNC implementations.  */
>> -#define MAX_IFUNC 4
>> +#define MAX_IFUNC 5
>>
>>  /* Fill ARRAY of MAX elements with IFUNC implementations for function
>>     NAME supported on target machine and return the number of valid
>> @@ -46,9 +46,12 @@ __libc_ifunc_impl_list (const char *name, struct
>> libc_ifunc_impl *array,
>>        __memcmp_ssse3)
>>        IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
>>
>> -  /* Support sysdeps/x86_64/multiarch/memmove_chk.S.  */
>> +  /* Support sysdeps/x86_64/multiarch/memmove_chk.c.  */
>>    IFUNC_IMPL (i, name, __memmove_chk,
>>        IFUNC_IMPL_ADD (array, i, __memmove_chk,
>> +      HAS_ARCH_FEATURE (AVX512F_Usable),
>> +      __memmove_chk_avx512_no_vzeroupper)
>> +      IFUNC_IMPL_ADD (array, i, __memmove_chk,
>>        HAS_ARCH_FEATURE (AVX_Usable),
>>        __memmove_chk_avx_unaligned)
>>        IFUNC_IMPL_ADD (array, i, __memmove_chk,
>> @@ -65,6 +68,9 @@ __libc_ifunc_impl_list (const char *name, struct
>> libc_ifunc_impl *array,
>>        IFUNC_IMPL_ADD (array, i, memmove,
>>        HAS_ARCH_FEATURE (AVX_Usable),
>>        __memmove_avx_unaligned)
>> +      IFUNC_IMPL_ADD (array, i, memmove,
>> +      HAS_ARCH_FEATURE (AVX512F_Usable),
>> +      __memmove_avx512_no_vzeroupper)
>>        IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
>>        __memmove_ssse3_back)
>>        IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
>> @@ -274,6 +280,9 @@ __libc_ifunc_impl_list (const char *name, struct
>> libc_ifunc_impl *array,
>>        __memcpy_ssse3_back)
>>        IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
>>        __memcpy_ssse3)
>> +      IFUNC_IMPL_ADD (array, i, memcpy,
>> +      HAS_ARCH_FEATURE (AVX512F_Usable),
>> +      __memcpy_avx512_no_vzeroupper)
>>        IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
>>        IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
>>
>> @@ -294,6 +303,9 @@ __libc_ifunc_impl_list (const char *name, struct
>> libc_ifunc_impl *array,
>>    /* Support sysdeps/x86_64/multiarch/mempcpy.S.  */
>>    IFUNC_IMPL (i, name, mempcpy,
>>        IFUNC_IMPL_ADD (array, i, mempcpy,
>> +      HAS_ARCH_FEATURE (AVX512F_Usable),
>> +      __mempcpy_avx512_no_vzeroupper)
>> +      IFUNC_IMPL_ADD (array, i, mempcpy,
>>        HAS_ARCH_FEATURE (AVX_Usable),
>>        __mempcpy_avx_unaligned)
>>        IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
> 
> Please add _chk tests.
> 
> 
>> diff --git a/sysdeps/x86_64/multiarch/memcpy.S
>> b/sysdeps/x86_64/multiarch/memcpy.S
>> index 27fca29..64a1bcd 100644
>> --- a/sysdeps/x86_64/multiarch/memcpy.S
>> +++ b/sysdeps/x86_64/multiarch/memcpy.S
>> @@ -30,19 +30,27 @@
>>  ENTRY(__new_memcpy)
>>   .type __new_memcpy, @gnu_indirect_function
>>   LOAD_RTLD_GLOBAL_RO_RDX
>> - leaq __memcpy_avx_unaligned(%rip), %rax
>> +#ifdef HAVE_AVX512_ASM_SUPPORT
>> + HAS_ARCH_FEATURE (AVX512F_Usable)
>> + jz 1f
>> + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
>> + jz 1f
>> + leaq    __memcpy_avx512_no_vzeroupper(%rip), %rax
>> + ret
>> +#endif
>> +1: leaq __memcpy_avx_unaligned(%rip), %rax
>>   HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
>> - jz 1f
>> + jz 2f
>>   ret
>> -1: leaq __memcpy_sse2(%rip), %rax
>> +2: leaq __memcpy_sse2(%rip), %rax
>>   HAS_ARCH_FEATURE (Slow_BSF)
>> - jnz 2f
>> + jnz 3f
>>   leaq __memcpy_sse2_unaligned(%rip), %rax
>>   ret
>> -2: HAS_CPU_FEATURE (SSSE3)
>> - jz 3f
>> +3: HAS_CPU_FEATURE (SSSE3)
>> + jz 4f
>>   leaq    __memcpy_ssse3(%rip), %rax
>> -3: ret
>> +4: ret
>>  END(__new_memcpy)
>>
>>  # undef ENTRY
> 
> Please find a way not to re-order labels when adding a new
> implementation next time.
> 

Maybe using 'libc_ifunc(...)' instead and let the compiler handle it?
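
For reference, memmove.c in this very patch already takes that approach.  A
memcpy selector written the same way might look like the sketch below; it
mirrors the branch logic of the current memcpy.S but leaves out the
__redirect/#include boilerplate and the symbol-versioning handled around
__new_memcpy in memcpy.S, so it is only an illustration of the suggestion,
not a drop-in replacement.

/* Sketch: memcpy ifunc selection in C via libc_ifunc, patterned after the
   memmove.c hunk in this patch.  HAS_ARCH_FEATURE, HAS_CPU_FEATURE and
   libc_ifunc are the glibc-internal macros used elsewhere in this patch.  */
extern __typeof (memcpy) __memcpy_sse2 attribute_hidden;
extern __typeof (memcpy) __memcpy_sse2_unaligned attribute_hidden;
extern __typeof (memcpy) __memcpy_ssse3 attribute_hidden;
extern __typeof (memcpy) __memcpy_avx_unaligned attribute_hidden;
#ifdef HAVE_AVX512_ASM_SUPPORT
extern __typeof (memcpy) __memcpy_avx512_no_vzeroupper attribute_hidden;
#endif

libc_ifunc (memcpy,
#ifdef HAVE_AVX512_ASM_SUPPORT
	    HAS_ARCH_FEATURE (AVX512F_Usable)
	      && HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
	    ? __memcpy_avx512_no_vzeroupper
	    :
#endif
	    (HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
	     ? __memcpy_avx_unaligned
	     : (!HAS_ARCH_FEATURE (Slow_BSF)
		? __memcpy_sse2_unaligned
		: (HAS_CPU_FEATURE (SSSE3)
		   ? __memcpy_ssse3
		   : __memcpy_sse2))));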
  
Andrew Senkevich Jan. 15, 2016, 10:02 p.m. UTC | #8
> On 13-01-2016 16:10, Senkevich, Andrew wrote:
>>> On 12-01-2016 12:13, Andrew Senkevich wrote:
>>>> Hi,
>>>>
>>>> here is AVX512 implementations of memcpy, mempcpy, memmove,
>>>> memcpy_chk, mempcpy_chk, memmove_chk.
>>>> It shows average improvement more than 30% over AVX versions on KNL
>>>> hardware, performance results attached.
>>>> Ok for trunk?
>>>
>>> It is too late for 2.23, but ok after review for 2.24.
>>
>> We would like this patch to be considered for glibc 2.23 since the functionality completes AVX-512 improvements of mem* routines.
>> Memset tuned for AVX-512 is already checked in so it looks reasonable to have full support in 2.23.
>> Also the changes are strongly AVX-512 specific, not adding any new interfaces so potential risk of the patch is pretty low.
>>
>> We already got review comments without any major questions to the patch and fixed version will be ready today.
>>
>> Given all this can the patch go to current glibc trunk after review is finished?
>
> Right, as I talked with Carlos in IRC I will let x86 maintainers have the final word on this inclusion.


Thanks!
  

Patch

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index b2e31ef..d234f4a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -7,11 +7,12 @@  ifeq ($(subdir),string)

 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
    strcmp-sse2-unaligned strncmp-ssse3 \
-   memcmp-sse4 memcpy-ssse3 \
-   memcpy-sse2-unaligned mempcpy-ssse3 \
-   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
-   memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
-   memmove-ssse3-back strcasecmp_l-ssse3 \
+   memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned \
+   memcpy-avx512-no-vzeroupper mempcpy-ssse3 memmove-ssse3 \
+   memcpy-ssse3-back mempcpy-ssse3-back memmove-avx-unaligned \
+   memcpy-avx-unaligned mempcpy-avx-unaligned \
+   mempcpy-avx512-no-vzeroupper memmove-ssse3-back \
+   memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
    strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
    strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
    strcpy-sse2-unaligned strncpy-sse2-unaligned \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 5f600dc..7746d79 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -24,7 +24,7 @@ 
 #include "init-arch.h"

 /* Maximum number of IFUNC implementations.  */
-#define MAX_IFUNC 4
+#define MAX_IFUNC 5

 /* Fill ARRAY of MAX elements with IFUNC implementations for function
    NAME supported on target machine and return the number of valid
@@ -46,9 +46,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
       __memcmp_ssse3)
       IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))

-  /* Support sysdeps/x86_64/multiarch/memmove_chk.S.  */
+  /* Support sysdeps/x86_64/multiarch/memmove_chk.c.  */
   IFUNC_IMPL (i, name, __memmove_chk,
       IFUNC_IMPL_ADD (array, i, __memmove_chk,
+      HAS_ARCH_FEATURE (AVX512F_Usable),
+      __memmove_chk_avx512_no_vzeroupper)
+      IFUNC_IMPL_ADD (array, i, __memmove_chk,
       HAS_ARCH_FEATURE (AVX_Usable),
       __memmove_chk_avx_unaligned)
       IFUNC_IMPL_ADD (array, i, __memmove_chk,
@@ -65,6 +68,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
       IFUNC_IMPL_ADD (array, i, memmove,
       HAS_ARCH_FEATURE (AVX_Usable),
       __memmove_avx_unaligned)
+      IFUNC_IMPL_ADD (array, i, memmove,
+      HAS_ARCH_FEATURE (AVX512F_Usable),
+      __memmove_avx512_no_vzeroupper)
       IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
       __memmove_ssse3_back)
       IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
@@ -274,6 +280,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
       __memcpy_ssse3_back)
       IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
       __memcpy_ssse3)
+      IFUNC_IMPL_ADD (array, i, memcpy,
+      HAS_ARCH_FEATURE (AVX512F_Usable),
+      __memcpy_avx512_no_vzeroupper)
       IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
       IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))

@@ -294,6 +303,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/mempcpy.S.  */
   IFUNC_IMPL (i, name, mempcpy,
       IFUNC_IMPL_ADD (array, i, mempcpy,
+      HAS_ARCH_FEATURE (AVX512F_Usable),
+      __mempcpy_avx512_no_vzeroupper)
+      IFUNC_IMPL_ADD (array, i, mempcpy,
       HAS_ARCH_FEATURE (AVX_Usable),
       __mempcpy_avx_unaligned)
       IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
new file mode 100644
index 0000000..462f038
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
@@ -0,0 +1,418 @@ 
+/* memcpy optimized with AVX512 for KNL hardware.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc) \
+    && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+#ifndef MEMCPY
+# define MEMCPY __memcpy_avx512_no_vzeroupper
+# define MEMCPY_CHK __memcpy_chk_avx512_no_vzeroupper
+#endif
+
+ .section .text,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+
+ENTRY (MEMCPY)
+ mov %rdi, %rax
+#ifdef USE_AS_MEMPCPY
+ add %rdx, %rax
+#endif
+ lea (%rsi, %rdx), %rcx
+ lea (%rdi, %rdx), %r9
+ cmp $512, %rdx
+ ja L(512bytesormore)
+
+L(check):
+ cmp $16, %rdx
+ jbe L(less_16bytes)
+ cmp $256, %rdx
+ jb L(less_256bytes)
+ vmovups (%rsi), %zmm0
+ vmovups 0x40(%rsi), %zmm1
+ vmovups 0x80(%rsi), %zmm2
+ vmovups 0xC0(%rsi), %zmm3
+ vmovups -0x100(%rcx), %zmm4
+ vmovups -0xC0(%rcx), %zmm5
+ vmovups -0x80(%rcx), %zmm6
+ vmovups -0x40(%rcx), %zmm7
+ vmovups %zmm0, (%rdi)
+ vmovups %zmm1, 0x40(%rdi)
+ vmovups %zmm2, 0x80(%rdi)
+ vmovups %zmm3, 0xC0(%rdi)
+ vmovups %zmm4, -0x100(%r9)
+ vmovups %zmm5, -0xC0(%r9)
+ vmovups %zmm6, -0x80(%r9)
+ vmovups %zmm7, -0x40(%r9)
+ ret
+
+L(less_256bytes):
+ cmp $128, %dl
+ jb L(less_128bytes)
+ vmovups (%rsi), %zmm0
+ vmovups 0x40(%rsi), %zmm1
+ vmovups -0x80(%rcx), %zmm2
+ vmovups -0x40(%rcx), %zmm3
+ vmovups %zmm0, (%rdi)
+ vmovups %zmm1, 0x40(%rdi)
+ vmovups %zmm2, -0x80(%r9)
+ vmovups %zmm3, -0x40(%r9)
+ ret
+
+L(less_128bytes):
+ cmp $64, %dl
+ jb L(less_64bytes)
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 0x20(%rsi), %ymm1
+ vmovdqu -0x40(%rcx), %ymm2
+ vmovdqu -0x20(%rcx), %ymm3
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, 0x20(%rdi)
+ vmovdqu %ymm2, -0x40(%r9)
+ vmovdqu %ymm3, -0x20(%r9)
+ ret
+
+L(less_64bytes):
+ cmp $32, %dl
+ jb L(less_32bytes)
+ vmovdqu (%rsi), %ymm0
+ vmovdqu -0x20(%rcx), %ymm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, -0x20(%r9)
+ ret
+
+L(less_32bytes):
+ vmovdqu (%rsi), %xmm0
+ vmovdqu -0x10(%rcx), %xmm1
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm1, -0x10(%r9)
+ ret
+
+L(less_16bytes):
+ cmp $8, %dl
+ jb L(less_8bytes)
+ movq (%rsi), %rsi
+ movq -0x8(%rcx), %rcx
+ movq %rsi, (%rdi)
+ movq %rcx, -0x8(%r9)
+ ret
+
+L(less_8bytes):
+ cmp $4, %dl
+ jb L(less_4bytes)
+ mov (%rsi), %esi
+ mov -0x4(%rcx), %ecx
+ mov %esi, (%rdi)
+ mov %ecx, -0x4(%r9)
+ ret
+
+L(less_4bytes):
+ cmp $2, %dl
+ jb L(less_2bytes)
+ mov (%rsi), %si
+ mov -0x2(%rcx), %cx
+ mov %si, (%rdi)
+ mov %cx, -0x2(%r9)
+ ret
+
+L(less_2bytes):
+ cmp $1, %dl
+ jb L(less_1bytes)
+ mov (%rsi), %cl
+ mov %cl, (%rdi)
+L(less_1bytes):
+ ret
+
+L(512bytesormore):
+#ifdef SHARED_CACHE_SIZE_HALF
+ mov $SHARED_CACHE_SIZE_HALF, %r8
+#else
+ mov __x86_shared_cache_size_half(%rip), %r8
+#endif
+ cmp %r8, %rdx
+ jae L(preloop_large)
+ cmp $1024, %rdx
+ ja L(1024bytesormore)
+ vmovups (%rsi), %zmm0
+ vmovups 0x40(%rsi), %zmm1
+ vmovups 0x80(%rsi), %zmm2
+ vmovups 0xC0(%rsi), %zmm3
+ vmovups 0x100(%rsi), %zmm4
+ vmovups 0x140(%rsi), %zmm5
+ vmovups 0x180(%rsi), %zmm6
+ vmovups 0x1C0(%rsi), %zmm7
+ vmovups -0x200(%rcx), %zmm8
+ vmovups -0x1C0(%rcx), %zmm9
+ vmovups -0x180(%rcx), %zmm10
+ vmovups -0x140(%rcx), %zmm11
+ vmovups -0x100(%rcx), %zmm12
+ vmovups -0xC0(%rcx), %zmm13
+ vmovups -0x80(%rcx), %zmm14
+ vmovups -0x40(%rcx), %zmm15
+ vmovups %zmm0, (%rdi)
+ vmovups %zmm1, 0x40(%rdi)
+ vmovups %zmm2, 0x80(%rdi)
+ vmovups %zmm3, 0xC0(%rdi)
+ vmovups %zmm4, 0x100(%rdi)
+ vmovups %zmm5, 0x140(%rdi)
+ vmovups %zmm6, 0x180(%rdi)
+ vmovups %zmm7, 0x1C0(%rdi)
+ vmovups %zmm8, -0x200(%r9)
+ vmovups %zmm9, -0x1C0(%r9)
+ vmovups %zmm10, -0x180(%r9)
+ vmovups %zmm11, -0x140(%r9)
+ vmovups %zmm12, -0x100(%r9)
+ vmovups %zmm13, -0xC0(%r9)
+ vmovups %zmm14, -0x80(%r9)
+ vmovups %zmm15, -0x40(%r9)
+ ret
+
+#ifndef USE_AS_MEMMOVE
+L(1024bytesormore):
+ sub $0x200, %r9
+
+/* Loop with unaligned memory access.  */
+L(gobble_512bytes_loop):
+ vmovups (%rsi), %zmm0
+ vmovups 0x40(%rsi), %zmm1
+ vmovups 0x80(%rsi), %zmm2
+ vmovups 0xC0(%rsi), %zmm3
+ vmovups 0x100(%rsi), %zmm4
+ vmovups 0x140(%rsi), %zmm5
+ vmovups 0x180(%rsi), %zmm6
+ vmovups 0x1C0(%rsi), %zmm7
+ add $0x200, %rsi
+ vmovups %zmm0, (%rdi)
+ vmovups %zmm1, 0x40(%rdi)
+ vmovups %zmm2, 0x80(%rdi)
+ vmovups %zmm3, 0xC0(%rdi)
+ vmovups %zmm4, 0x100(%rdi)
+ vmovups %zmm5, 0x140(%rdi)
+ vmovups %zmm6, 0x180(%rdi)
+ vmovups %zmm7, 0x1C0(%rdi)
+ add $0x200, %rdi
+ cmp %r9, %rdi
+ jb L(gobble_512bytes_loop)
+ vmovups -0x200(%rcx), %zmm0
+ vmovups -0x1C0(%rcx), %zmm1
+ vmovups -0x180(%rcx), %zmm2
+ vmovups -0x140(%rcx), %zmm3
+ vmovups -0x100(%rcx), %zmm4
+ vmovups -0xC0(%rcx), %zmm5
+ vmovups -0x80(%rcx), %zmm6
+ vmovups -0x40(%rcx), %zmm7
+ vmovups %zmm0, (%r9)
+ vmovups %zmm1, 0x40(%r9)
+ vmovups %zmm2, 0x80(%r9)
+ vmovups %zmm3, 0xC0(%r9)
+ vmovups %zmm4, 0x100(%r9)
+ vmovups %zmm5, 0x140(%r9)
+ vmovups %zmm6, 0x180(%r9)
+ vmovups %zmm7, 0x1C0(%r9)
+ ret
+
+/* Align destination for access with non-temporal stores in the loop.  */
+L(preloop_large):
+ mov %rdi, %r8
+ and $-0x80, %rdi
+ add $0x80, %rdi
+ sub %rdi, %r8
+ vmovups (%rsi), %zmm0
+ vmovups 0x40(%rsi), %zmm1
+ vmovups %zmm0, (%rax)
+ vmovups %zmm1, 0x40(%rax)
+ sub %r8, %rsi
+ sub $0x100, %r9
+ prefetcht1 (%rsi)
+ prefetcht1 0x40(%rsi)
+ prefetcht1 0x80(%rsi)
+ prefetcht1 0xC0(%rsi)
+L(gobble_256bytes_nt_loop):
+ prefetcht1 0x100(%rsi)
+ prefetcht1 0x140(%rsi)
+ prefetcht1 0x180(%rsi)
+ prefetcht1 0x1C0(%rsi)
+ vmovups (%rsi), %zmm0
+ vmovups 0x40(%rsi), %zmm1
+ vmovups 0x80(%rsi), %zmm2
+ vmovups 0xC0(%rsi), %zmm3
+ add $0x100, %rsi
+ vmovntdq %zmm0, (%rdi)
+ vmovntdq %zmm1, 0x40(%rdi)
+ vmovntdq %zmm2, 0x80(%rdi)
+ vmovntdq %zmm3, 0xC0(%rdi)
+ add $0x100, %rdi
+ cmp %r9, %rdi
+ jb L(gobble_256bytes_nt_loop)
+ sfence
+ vmovups -0x100(%rcx), %zmm0
+ vmovups -0xC0(%rcx), %zmm1
+ vmovups -0x80(%rcx), %zmm2
+ vmovups -0x40(%rcx), %zmm3
+ vmovups %zmm0, (%r9)
+ vmovups %zmm1, 0x40(%r9)
+ vmovups %zmm2, 0x80(%r9)
+ vmovups %zmm3, 0xC0(%r9)
+ ret
+#else
+/* Memmove implementation.  */
+L(1024bytesormore):
+ cmp %rsi, %rdi
+ ja L(1024bytesormore_bkw)
+ prefetcht1 (%rsi)
+ prefetcht1 0x40(%rsi)
+ prefetcht1 0x80(%rsi)
+ prefetcht1 0xC0(%rsi)
+
+/* Loop with unaligned memory access.  */
+L(gobble_256bytes_loop):
+ vmovups (%rsi), %zmm0
+ prefetcht1 0x100(%rsi)
+ vmovups 0x40(%rsi), %zmm1
+ prefetcht1 0x140(%rsi)
+ vmovups 0x80(%rsi), %zmm2
+ prefetcht1 0x180(%rsi)
+ vmovups 0xC0(%rsi), %zmm3
+ prefetcht1 0x1C0(%rsi)
+ vmovups %zmm0, (%rdi)
+ vmovups %zmm1, 0x40(%rdi)
+ vmovups %zmm2, 0x80(%rdi)
+ vmovups %zmm3, 0xC0(%rdi)
+ sub $256, %rdx
+ add $256, %rsi
+ add $256, %rdi
+ cmp $256, %rdx
+ jae L(gobble_256bytes_loop)
+ jmp L(check)
+
+L(1024bytesormore_bkw):
+ prefetcht1 -0x100(%rcx)
+ prefetcht1 -0xC0(%rcx)
+ prefetcht1 -0x80(%rcx)
+ prefetcht1 -0x40(%rcx)
+
+/* Loop with unaligned memory access.  */
+L(gobble_256bytes_loop_bkw):
+ vmovups -0x100(%rcx), %zmm0
+ prefetcht1 -0x200(%rcx)
+ vmovups -0xC0(%rcx), %zmm1
+ prefetcht1 -0x1C0(%rcx)
+ vmovups -0x80(%rcx), %zmm2
+ prefetcht1 -0x180(%rcx)
+ vmovups -0x40(%rcx), %zmm3
+ prefetcht1 -0x140(%rcx)
+ vmovups %zmm0, -0x100(%r9)
+ vmovups %zmm1, -0xC0(%r9)
+ vmovups %zmm2, -0x80(%r9)
+ vmovups %zmm3, -0x40(%r9)
+ sub $256, %rdx
+ sub $256, %rcx
+ sub $256, %r9
+ cmp $256, %rdx
+ jae L(gobble_256bytes_loop_bkw)
+ jmp L(check)
+
+L(preloop_large):
+ cmp %rsi, %rdi
+ ja L(preloop_large_bkw)
+ vmovups (%rsi), %zmm4
+ vmovups 0x40(%rsi), %zmm5
+
+/* Align destination for access with non-temporal stores in the loop.  */
+ mov %rdi, %r8
+ and $-0x80, %rdi
+ add $0x80, %rdi
+ sub %rdi, %r8
+ sub %r8, %rsi
+ add %r8, %rdx
+ prefetcht1 (%rsi)
+ prefetcht1 0x40(%rsi)
+ prefetcht1 0x80(%rsi)
+ prefetcht1 0xC0(%rsi)
+L(gobble_256bytes_nt_loop):
+ vmovups (%rsi), %zmm0
+ prefetcht1 0x100(%rsi)
+ vmovups 0x40(%rsi), %zmm1
+ prefetcht1 0x140(%rsi)
+ vmovups 0x80(%rsi), %zmm2
+ prefetcht1 0x180(%rsi)
+ vmovups 0xC0(%rsi), %zmm3
+ prefetcht1 0x1C0(%rsi)
+ vmovntdq %zmm0, (%rdi)
+ vmovntdq %zmm1, 0x40(%rdi)
+ vmovntdq %zmm2, 0x80(%rdi)
+ vmovntdq %zmm3, 0xC0(%rdi)
+ sub $256, %rdx
+ add $256, %rsi
+ add $256, %rdi
+ cmp $256, %rdx
+ ja L(gobble_256bytes_nt_loop)
+ sfence
+ vmovups %zmm4, (%rax)
+ vmovups %zmm5, 0x40(%rax)
+ jmp L(check)
+
+L(preloop_large_bkw):
+ vmovups -0x80(%rcx), %zmm4
+ vmovups -0x40(%rcx), %zmm5
+
+/* Align end of destination for access with non-temporal stores.  */
+ mov %r9, %r8
+ and $-0x80, %r9
+ sub %r9, %r8
+ sub %r8, %rcx
+ sub %r8, %rdx
+ add %r9, %r8
+ prefetcht1 -0x100(%rcx)
+ prefetcht1 -0xC0(%rcx)
+ prefetcht1 -0x80(%rcx)
+ prefetcht1 -0x40(%rcx)
+L(gobble_256bytes_nt_loop_bkw):
+ vmovups -0x100(%rcx), %zmm0
+ prefetcht1 -0x200(%rcx)
+ vmovups -0xC0(%rcx), %zmm1
+ prefetcht1 -0x1C0(%rcx)
+ vmovups -0x80(%rcx), %zmm2
+ prefetcht1 -0x180(%rcx)
+ vmovups -0x40(%rcx), %zmm3
+ prefetcht1 -0x140(%rcx)
+ vmovntdq %zmm0, -0x100(%r9)
+ vmovntdq %zmm1, -0xC0(%r9)
+ vmovntdq %zmm2, -0x80(%r9)
+ vmovntdq %zmm3, -0x40(%r9)
+ sub $256, %rdx
+ sub $256, %rcx
+ sub $256, %r9
+ cmp $256, %rdx
+ ja L(gobble_256bytes_nt_loop_bkw)
+ sfence
+ vmovups %zmm4, -0x80(%r8)
+ vmovups %zmm5, -0x40(%r8)
+ jmp L(check)
+#endif
+END (MEMCPY)
+#endif
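
As an aside for readers who would rather not trace the assembly above, here
is a rough C outline of the size dispatch the forward-copy path implements.
It is an editorial summary only: the enum and the function are made-up names,
the thresholds come from the cmp/ja chain in the file, and the
memmove/backward paths are left out.

#include <stddef.h>

/* Which path the new memcpy takes for an N-byte forward copy.  */
enum avx512_copy_path
{
  COPY_SMALL,		/* n <= 512: overlapping head/tail vector copies,
			   no loop (the L(check) ladder).  */
  COPY_TWO_BLOCKS,	/* 512 < n <= 1024: sixteen unaligned zmm
			   loads/stores covering both ends.  */
  COPY_LOOP,		/* larger, but below the cache threshold: 512 bytes
			   per iteration, unaligned accesses
			   (L(gobble_512bytes_loop)).  */
  COPY_NONTEMPORAL	/* n >= half the shared cache size: align the
			   destination to 128 bytes, prefetcht1 the source,
			   store with vmovntdq, then sfence
			   (L(preloop_large)).  */
};

enum avx512_copy_path
classify_copy (size_t n, size_t shared_cache_size_half)
{
  if (n <= 512)
    return COPY_SMALL;
  if (n >= shared_cache_size_half)
    return COPY_NONTEMPORAL;
  if (n <= 1024)
    return COPY_TWO_BLOCKS;
  return COPY_LOOP;
}
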
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 27fca29..64a1bcd 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -30,19 +30,27 @@ 
 ENTRY(__new_memcpy)
  .type __new_memcpy, @gnu_indirect_function
  LOAD_RTLD_GLOBAL_RO_RDX
- leaq __memcpy_avx_unaligned(%rip), %rax
+#ifdef HAVE_AVX512_ASM_SUPPORT
+ HAS_ARCH_FEATURE (AVX512F_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jz 1f
+ leaq    __memcpy_avx512_no_vzeroupper(%rip), %rax
+ ret
+#endif
+1: leaq __memcpy_avx_unaligned(%rip), %rax
  HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 1f
+ jz 2f
  ret
-1: leaq __memcpy_sse2(%rip), %rax
+2: leaq __memcpy_sse2(%rip), %rax
  HAS_ARCH_FEATURE (Slow_BSF)
- jnz 2f
+ jnz 3f
  leaq __memcpy_sse2_unaligned(%rip), %rax
  ret
-2: HAS_CPU_FEATURE (SSSE3)
- jz 3f
+3: HAS_CPU_FEATURE (SSSE3)
+ jz 4f
  leaq    __memcpy_ssse3(%rip), %rax
-3: ret
+4: ret
 END(__new_memcpy)

 # undef ENTRY
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 6476c62..a0d56d4 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -30,7 +30,15 @@ 
 ENTRY(__memcpy_chk)
  .type __memcpy_chk, @gnu_indirect_function
  LOAD_RTLD_GLOBAL_RO_RDX
- leaq __memcpy_chk_sse2(%rip), %rax
+#ifdef HAVE_AVX512_ASM_SUPPORT
+ HAS_ARCH_FEATURE (AVX512F_Usable)
+ jz      1f
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jz      1f
+ leaq    __memcpy_chk_avx512_no_vzeroupper(%rip), %rax
+ ret
+#endif
+1: leaq __memcpy_chk_sse2(%rip), %rax
  HAS_CPU_FEATURE (SSSE3)
  jz 2f
  leaq __memcpy_chk_ssse3(%rip), %rax
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
new file mode 100644
index 0000000..aaad0c5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
@@ -0,0 +1,22 @@ 
+/* memmove optimized with AVX512 for KNL hardware.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_MEMMOVE
+#define MEMCPY __memmove_avx512_no_vzeroupper
+#define MEMCPY_CHK __memmove_chk_avx512_no_vzeroupper
+#include "memcpy-avx512-no-vzeroupper.S"
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
index e844531..8da5640 100644
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -36,6 +36,9 @@ extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
+# ifdef HAVE_AVX512_ASM_SUPPORT
+  extern __typeof (__redirect_memmove) __memmove_avx512_no_vzeroupper attribute_hidden;
+# endif

 #endif

@@ -49,12 +52,18 @@ extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
    ifunc symbol properly.  */
 extern __typeof (__redirect_memmove) __libc_memmove;
 libc_ifunc (__libc_memmove,
-    HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+#ifdef HAVE_AVX512_ASM_SUPPORT
+    HAS_ARCH_FEATURE (AVX512F_Usable)
+      && HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+    ? __memmove_avx512_no_vzeroupper
+    :
+#endif
+    (HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
     ? __memmove_avx_unaligned
     : (HAS_CPU_FEATURE (SSSE3)
        ? (HAS_ARCH_FEATURE (Fast_Copy_Backward)
           ? __memmove_ssse3_back : __memmove_ssse3)
-       : __memmove_sse2));
+       : __memmove_sse2)));

 strong_alias (__libc_memmove, memmove)

diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
index 60ed98f..f64da63 100644
--- a/sysdeps/x86_64/multiarch/memmove_chk.c
+++ b/sysdeps/x86_64/multiarch/memmove_chk.c
@@ -26,10 +26,19 @@ extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden;
 extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden;
 extern __typeof (__memmove_chk) __memmove_chk_ssse3_back attribute_hidden;
 extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
+# ifdef HAVE_AVX512_ASM_SUPPORT
+  extern __typeof (__memmove_chk) __memmove_chk_avx512_no_vzeroupper attribute_hidden;
+# endif

 #include "debug/memmove_chk.c"

 libc_ifunc (__memmove_chk,
+#ifdef HAVE_AVX512_ASM_SUPPORT
+    HAS_ARCH_FEATURE (AVX512F_Usable)
+      && HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+    ? __memmove_chk_avx512_no_vzeroupper
+    :
+#endif
     HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) ? __memmove_chk_avx_unaligned :
     (HAS_CPU_FEATURE (SSSE3)
     ? (HAS_ARCH_FEATURE (Fast_Copy_Backward)
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S
new file mode 100644
index 0000000..ccffb31
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S
@@ -0,0 +1,22 @@ 
+/* mempcpy optimized with AVX512 for KNL hardware.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_MEMPCPY
+#define MEMCPY __mempcpy_avx512_no_vzeroupper
+#define MEMCPY_CHK __mempcpy_chk_avx512_no_vzeroupper
+#include "memcpy-avx512-no-vzeroupper.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index 96ffb28..ed78623 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -28,7 +28,15 @@ 
 ENTRY(__mempcpy)
  .type __mempcpy, @gnu_indirect_function
  LOAD_RTLD_GLOBAL_RO_RDX
- leaq __mempcpy_sse2(%rip), %rax
+#ifdef HAVE_AVX512_ASM_SUPPORT
+ HAS_ARCH_FEATURE (AVX512F_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jz 1f
+ leaq    __mempcpy_avx512_no_vzeroupper(%rip), %rax
+ ret
+#endif
+1: leaq __mempcpy_sse2(%rip), %rax
  HAS_CPU_FEATURE (SSSE3)
  jz 2f
  leaq __mempcpy_ssse3(%rip), %rax
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index ae7a765..6e8a89d 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -30,7 +30,15 @@ 
 ENTRY(__mempcpy_chk)
  .type __mempcpy_chk, @gnu_indirect_function
  LOAD_RTLD_GLOBAL_RO_RDX
- leaq __mempcpy_chk_sse2(%rip), %rax
+#ifdef HAVE_AVX512_ASM_SUPPORT
+ HAS_ARCH_FEATURE (AVX512F_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jz 1f
+ leaq    __mempcpy_chk_avx512_no_vzeroupper(%rip), %rax
+ ret
+#endif
+1: leaq __mempcpy_chk_sse2(%rip), %rax
  HAS_CPU_FEATURE (SSSE3)
  jz 2f
  leaq __mempcpy_chk_ssse3(%rip), %rax