[RFC] Improve 64-bit memset performance for Haswell CPUs with AVX2 instructions

Message ID 1396850238-29041-1-git-send-email-ling.ma@alipay.com
State Superseded

Commit Message

ling.ma.program@gmail.com April 7, 2014, 5:57 a.m. UTC
  From: Ling Ma <ling.ml@alibaba-inc.com>

In this patch we take advantage of HSW memory bandwidth, reduce branch
mispredictions by avoiding branch instructions, and force the destination
to be aligned using AVX instructions.

The CPU2006 403.gcc benchmark also indicates that this patch improves
performance by 22.9% to 59% compared with the original memset implemented
with SSE2.

	workload			memset-AVX		memset-SSE2		AVX vs SSE2
	gcc.166.i		1877958334		2495113045		1.328630673
	gcc.200.i		3507448572		4869401205		1.388302952
	gcc.cp-decl.i	1742510758		2282801367		1.310064432
	gcc.c-typeck.i	9546331594		12158804366		1.273662479
	gcc.expr2.i		5067111165		6470777800		1.277015165
	gcc.expr.i		3434703577		4420252661		1.286938614
	gcc.g23.i		5141096267		6318410858		1.22900069
	gcc.s04.i		8652255048		10923077090		1.262454358
	gcc.scilab.i	1209694573		1925173588		1.591454265

---
 We fixed the code and re-tested all cases, including SSE2 and AVX2.

 ChangeLog                              |   9 ++
 sysdeps/x86_64/multiarch/Makefile      |   4 +-
 sysdeps/x86_64/multiarch/memset-avx2.S | 192 +++++++++++++++++++++++++++++++++
 sysdeps/x86_64/multiarch/memset.S      |  59 ++++++++++
 sysdeps/x86_64/multiarch/memset_chk.S  |  44 ++++++++
 5 files changed, 307 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/x86_64/multiarch/memset-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/memset.S
 create mode 100644 sysdeps/x86_64/multiarch/memset_chk.S
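
To illustrate the idea in C: instead of branching on the exact size, each
small-size class is covered by unaligned vector stores anchored at both the
start and the end of the buffer, which may overlap in the middle.  A minimal
sketch of one size class (illustrative only, not the assembly in this patch;
the helper name is made up):

#include <emmintrin.h>
#include <stddef.h>

/* Fill a buffer of 16..32 bytes: one unaligned 16-byte store from the
   front and one from the back.  The two stores overlap when n < 32, so
   no further branches or tail handling are needed in this class.  */
static void
set_16_to_32 (char *dst, int c, size_t n)
{
  __m128i v = _mm_set1_epi8 ((char) c);
  _mm_storeu_si128 ((__m128i *) dst, v);
  _mm_storeu_si128 ((__m128i *) (dst + n - 16), v);
}

The same pattern with 4, 8 or 16 stores handles the larger classes below 256
bytes; for 256 bytes and above the destination is rounded up to a 32-byte
boundary so that the main loop can use aligned 32-byte stores.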
  

Comments

ling.ma.program@gmail.com April 9, 2014, 2:51 a.m. UTC | #1
Any comments about memcpy/memset/avx detection patches?

Thanks
Ling

2014-04-07 13:57 GMT+08:00, ling.ma.program@gmail.com
<ling.ma.program@gmail.com>:
> From: Ling Ma <ling.ml@alibaba-inc.com>
>
> In this patch we take advantage of HSW memory bandwidth, manage to
> reduce miss branch prediction by avoid using branch instructions and
> force destination to be aligned with avx instruction.
>
> The CPU2006 403.gcc benchmark also indicate this patch improves performance
> from  22.9% to 59% compared with original memset implemented by sse2.
>
> 					memset-AVX		memset-SSE2		AVX vs SSE2		
> 	gcc.166.i		1877958334		2495113045		1.328630673
> 	gcc.200.i		3507448572		4869401205		1.388302952
> 	gcc.cp-decl.i	1742510758		2282801367		1.310064432
> 	gcc.c-typeck.i	9546331594		12158804366		1.273662479
> 	gcc.expr2.i		5067111165		6470777800		1.277015165
> 	gcc.expr.i		3434703577		4420252661		1.286938614
> 	gcc.g23.i		5141096267		6318410858		1.22900069
> 	gcc.s04.i		8652255048		10923077090		1.262454358
> 	gcc.scilab.i	1209694573		1925173588		1.591454265
>
> ---
>  We fixed code and re-test all cases, including sse2 and avx2.
>
>  ChangeLog                              |   9 ++
>  sysdeps/x86_64/multiarch/Makefile      |   4 +-
>  sysdeps/x86_64/multiarch/memset-avx2.S | 192
> +++++++++++++++++++++++++++++++++
>  sysdeps/x86_64/multiarch/memset.S      |  59 ++++++++++
>  sysdeps/x86_64/multiarch/memset_chk.S  |  44 ++++++++
>  5 files changed, 307 insertions(+), 1 deletion(-)
>  create mode 100644 sysdeps/x86_64/multiarch/memset-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/memset.S
>  create mode 100644 sysdeps/x86_64/multiarch/memset_chk.S
>
> diff --git a/ChangeLog b/ChangeLog
> index ab23a3a..851fe9e 100644
> --- a/ChangeLog
> +++ b/ChangeLog
> @@ -1,3 +1,12 @@
> +2014-04-04  Ling Ma  <ling.ml@alibaba-inc.com>
> +
> +	* sysdeps/x86_64/multiarch/Makefile: Add memset-avx2
> +	* sysdeps/x86_64/multiarch/memset-avx2.S: New file for AVX2 memset
> +	* sysdeps/x86_64/multiarch/memset.S: New file for multiple memset
> +	versions
> +	* sysdeps/x86_64/multiarch/memset_chk.S: New file for multiple memset_chk
> +	versions
> +
>  2014-04-04  Sihai Yao  <sihai.ysh@alibaba-inc.com>
>
>  	* sysdeps/x86_64/multiarch/ifunc-defines.sym: Add COMMON_CPU_INDEX_7 and
> diff --git a/sysdeps/x86_64/multiarch/Makefile
> b/sysdeps/x86_64/multiarch/Makefile
> index 57a3c13..42df96f 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -17,7 +17,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c
> strcmp-ssse3 \
>  		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
>  		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
>  		   strcat-sse2-unaligned strncat-sse2-unaligned \
> -		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned
> +		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
> +		   memset-avx2
> +
>  ifeq (yes,$(config-cflags-sse4))
>  sysdep_routines += strcspn-c strpbrk-c strspn-c varshift
>  CFLAGS-varshift.c += -msse4
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S
> b/sysdeps/x86_64/multiarch/memset-avx2.S
> new file mode 100644
> index 0000000..5d4a487
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset-avx2.S
> @@ -0,0 +1,192 @@
> +/* memset with AVX2
> +   Copyright (C) 2014 Free Software Foundation, Inc.
> +   Contributed by Alibaba Group.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +#if !defined NOT_IN_libc
> +
> +#include "asm-syntax.h"
> +#ifndef ALIGN
> +# define ALIGN(n)	.p2align n
> +#endif
> +#ifndef MEMSET
> +# define MEMSET	__memset_avx2
> +# define MEMSET_CHK	__memset_chk_avx2
> +#endif
> +
> +	.section .text.avx2,"ax",@progbits
> +#if defined PIC
> +ENTRY (MEMSET_CHK)
> +	cmpq	%rdx, %rcx
> +	jb	HIDDEN_JUMPTARGET (__chk_fail)
> +END (MEMSET_CHK)
> +#endif
> +
> +ENTRY (MEMSET)
> +	vpxor	%xmm0, %xmm0, %xmm0
> +	vmovd %esi, %xmm1
> +	lea	(%rdi, %rdx), %r8
> +	vpshufb	%xmm0, %xmm1, %xmm0
> +	mov	%rdi, %rax
> +	cmp	$256, %rdx
> +	jae	L(256bytesormore)
> +	vmovd %xmm0, %rcx
> +	cmp	$128, %rdx
> +	jb	L(less_128bytes)
> +	vmovups %xmm0, (%rdi)
> +	vmovups %xmm0, 0x10(%rdi)
> +	vmovups %xmm0, 0x20(%rdi)
> +	vmovups %xmm0, 0x30(%rdi)
> +	vmovups %xmm0, 0x40(%rdi)
> +	vmovups %xmm0, 0x50(%rdi)
> +	vmovups %xmm0, 0x60(%rdi)
> +	vmovups %xmm0, 0x70(%rdi)
> +	vmovups %xmm0, -0x80(%r8)
> +	vmovups %xmm0, -0x70(%r8)
> +	vmovups %xmm0, -0x60(%r8)
> +	vmovups %xmm0, -0x50(%r8)
> +	vmovups %xmm0, -0x40(%r8)
> +	vmovups %xmm0, -0x30(%r8)
> +	vmovups %xmm0, -0x20(%r8)
> +	vmovups %xmm0, -0x10(%r8)
> +	ret
> +	ALIGN(4)
> +L(less_128bytes):
> +	cmp	$64, %edx
> +	jb	L(less_64bytes)
> +	vmovups %xmm0, (%rdi)
> +	vmovups %xmm0, 0x10(%rdi)
> +	vmovups %xmm0, 0x20(%rdi)
> +	vmovups %xmm0, 0x30(%rdi)
> +	vmovups %xmm0, -0x40(%r8)
> +	vmovups %xmm0, -0x30(%r8)
> +	vmovups %xmm0, -0x20(%r8)
> +	vmovups %xmm0, -0x10(%r8)
> +	ret
> +	ALIGN(4)
> +L(less_64bytes):
> +	cmp	$32, %edx
> +	jb	L(less_32bytes)
> +	vmovups %xmm0, (%rdi)
> +	vmovups %xmm0, 0x10(%rdi)
> +	vmovups %xmm0, -0x20(%r8)
> +	vmovups %xmm0, -0x10(%r8)
> +	ret
> +	ALIGN(4)
> +L(less_32bytes):
> +	cmp	$16, %edx
> +	jb	L(less_16bytes)
> +	vmovups %xmm0, (%rdi)
> +	vmovups %xmm0, -0x10(%r8)
> +	ret
> +	ALIGN(4)
> +L(less_16bytes):
> +	cmp	$8, %edx
> +	jb	L(less_8bytes)
> +	mov %rcx, (%rdi)
> +	mov %rcx, -0x08(%r8)
> +	ret
> +	ALIGN(4)
> +L(less_8bytes):
> +	cmp	$4, %edx
> +	jb	L(less_4bytes)
> +	mov %ecx, (%rdi)
> +	mov %ecx, -0x04(%r8)
> +	ALIGN(4)
> +L(less_4bytes):
> +	cmp	$2, %edx
> +	jb	L(less_2bytes)
> +	mov	%cx, (%rdi)
> +	mov	%cx, -0x02(%r8)
> +	ret
> +	ALIGN(4)
> +L(less_2bytes):
> +	cmp	$1, %edx
> +	jb	L(less_1bytes)
> +	mov	%cl, (%rdi)
> +L(less_1bytes):
> +	ret
> +
> +	ALIGN(4)
> +L(256bytesormore):
> +	vinserti128 $1, %xmm0, %ymm0, %ymm0
> +	vmovups	%ymm0, (%rdi)
> +	mov	%rdi, %r9
> +	and	$-0x20, %rdi
> +	add	$32, %rdi
> +	sub	%rdi, %r9
> +	add	%r9, %rdx
> +	cmp	$4096, %rdx
> +	ja	L(gobble_data)
> +
> +	sub	$0x80, %rdx
> +L(gobble_128_loop):
> +	vmovaps	%ymm0, (%rdi)
> +	vmovaps	%ymm0, 0x20(%rdi)
> +	vmovaps	%ymm0, 0x40(%rdi)
> +	vmovaps	%ymm0, 0x60(%rdi)
> +	lea	0x80(%rdi), %rdi
> +	sub	$0x80, %rdx
> +	jae	L(gobble_128_loop)
> +	vmovups	%ymm0, -0x80(%r8)
> +	vmovups	%ymm0, -0x60(%r8)
> +	vmovups	%ymm0, -0x40(%r8)
> +	vmovups	%ymm0, -0x20(%r8)
> +	vzeroupper
> +	ret
> +
> +	ALIGN(4)
> +L(gobble_data):
> +#ifdef SHARED_CACHE_SIZE_HALF
> +	mov	$SHARED_CACHE_SIZE_HALF, %r9
> +#else
> +	mov	__x86_shared_cache_size_half(%rip), %r9
> +#endif
> +	shl	$4, %r9
> +	cmp	%r9, %rdx
> +	ja	L(gobble_big_data)
> +	mov	%rax, %r9
> +	mov	%esi, %eax
> +	mov	%rdx, %rcx
> +	rep	stosb
> +	mov	%r9, %rax
> +	vzeroupper
> +	ret
> +
> +	ALIGN(4)
> +L(gobble_big_data):
> +	sub	$0x80, %rdx
> +L(gobble_big_data_loop):
> +	vmovntdq	%ymm0, (%rdi)
> +	vmovntdq	%ymm0, 0x20(%rdi)
> +	vmovntdq	%ymm0, 0x40(%rdi)
> +	vmovntdq	%ymm0, 0x60(%rdi)
> +	lea	0x80(%rdi), %rdi
> +	sub	$0x80, %rdx
> +	jae	L(gobble_big_data_loop)
> +	vmovups	%ymm0, -0x80(%r8)
> +	vmovups	%ymm0, -0x60(%r8)
> +	vmovups	%ymm0, -0x40(%r8)
> +	vmovups	%ymm0, -0x20(%r8)
> +	vzeroupper
> +	sfence
> +	ret
> +
> +END (MEMSET)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/memset.S
> b/sysdeps/x86_64/multiarch/memset.S
> new file mode 100644
> index 0000000..df903af
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset.S
> @@ -0,0 +1,59 @@
> +/* Multiple versions of memset
> +   Copyright (C) 2014 Free Software Foundation, Inc.
> +   Contributed by Alibaba Group.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include <shlib-compat.h>
> +#include <init-arch.h>
> +
> +/* Define multiple versions only for the definition in lib.  */
> +#ifndef NOT_IN_libc
> +ENTRY(memset)
> +	.type	memset, @gnu_indirect_function
> +	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
> +	jne	1f
> +	call	__init_cpu_features
> +1:	leaq	__memset_sse2(%rip), %rax
> +	testl	$bit_AVX2_Usable,
> __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
> +	jz	2f
> +	leaq	__memset_avx2(%rip), %rax
> +2:	ret
> +END(memset)
> +#endif
> +
> +#if !defined NOT_IN_libc
> +# undef memset
> +# define memset __memset_sse2
> +
> +# undef __memset_chk
> +# define __memset_chk __memset_chk_sse2
> +
> +# ifdef SHARED
> +#  undef libc_hidden_builtin_def
> +/* It doesn't make sense to send libc-internal memset calls through a PLT.
> +   The speedup we get from using GPR instruction is likely eaten away
> +   by the indirect call in the PLT.  */
> +#  define libc_hidden_builtin_def(name) \
> +	.globl __GI_memset; __GI_memset = __memset_sse2
> +# endif
> +
> +# undef strong_alias
> +# define strong_alias(original, alias)
> +#endif
> +
> +#include "../memset.S"
> diff --git a/sysdeps/x86_64/multiarch/memset_chk.S
> b/sysdeps/x86_64/multiarch/memset_chk.S
> new file mode 100644
> index 0000000..f048dac
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset_chk.S
> @@ -0,0 +1,44 @@
> +/* Multiple versions of memset_chk
> +   Copyright (C) 2014 Free Software Foundation, Inc.
> +   Contributed by Alibaba Group.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include <init-arch.h>
> +
> +/* Define multiple versions only for the definition in lib.  */
> +#ifndef NOT_IN_libc
> +# ifdef SHARED
> +ENTRY(__memset_chk)
> +	.type	__memset_chk, @gnu_indirect_function
> +	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
> +	jne	1f
> +	call	__init_cpu_features
> +1:	leaq	__memset_chk_sse2(%rip), %rax
> +	testl	$bit_AVX2_Usable,
> __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
> +	jz	2f
> +	leaq	__memset_chk_avx2(%rip), %rax
> +2:	ret
> +END(__memset_chk)
> +
> +strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
> +	.section .gnu.warning.__memset_zero_constant_len_parameter
> +	.string "memset used with constant zero length parameter; this could be
> due to transposed parameters"
> +# else
> +#  include "../memset_chk.S"
> +# endif
> +#endif
> --
> 1.8.1.4
>
>
  
Ondrej Bilka April 10, 2014, 10:27 p.m. UTC | #2
On Mon, Apr 07, 2014 at 01:57:18AM -0400, ling.ma.program@gmail.com wrote:
> From: Ling Ma <ling.ml@alibaba-inc.com>
> 
> In this patch we take advantage of HSW memory bandwidth, manage to
> reduce miss branch prediction by avoid using branch instructions and
> force destination to be aligned with avx instruction. 
> 
> The CPU2006 403.gcc benchmark also indicate this patch improves performance
> from  22.9% to 59% compared with original memset implemented by sse2.
>
Looks mostly OK except for some mostly mechanical changes.


> diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
> new file mode 100644
> index 0000000..5d4a487
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset-avx2.S
> @@ -0,0 +1,192 @@
> +/* memset with AVX2
> +   Copyright (C) 2014 Free Software Foundation, Inc.
> +   Contributed by Alibaba Group.

We no longer add Contributed by.

> +#include "asm-syntax.h"
> +#ifndef ALIGN
> +# define ALIGN(n)	.p2align n
> +#endif

Also, in the meantime we decided to remove the ALIGN macro, so remove that
and s/ALIGN(3)/.p2align 4/, s/ALIGN(4)/.p2align 4/.

> +
> +ENTRY (MEMSET)
> +	vpxor	%xmm0, %xmm0, %xmm0
> +	vmovd %esi, %xmm1
> +	lea	(%rdi, %rdx), %r8

< snip >

> +	vmovups %xmm0, 0x70(%rdi)
> +	vmovups %xmm0, -0x80(%r8)

I would globally replace %r8 with %rsi; this makes the instruction a byte shorter. The same applies to %r9.

> +L(less_4bytes):
> +	cmp	$2, %edx
> +	jb	L(less_2bytes)
> +	mov	%cx, (%rdi)
> +	mov	%cx, -0x02(%r8)
> +	ret
> +	ALIGN(4)
> +L(less_2bytes):
> +	cmp	$1, %edx
> +	jb	L(less_1bytes)
> +	mov	%cl, (%rdi)
> +L(less_1bytes):
> +	ret
> +

Here the current implementation saves one comparison by doing:

L(less_4bytes):
     cmp     $1, %edx
     jbe     L(less_2bytes)
     mov     %cx, (%rdi)
     mov     %cx, -0x02(%r8)
     ret
     ALIGN(4)
L(less_2bytes):
     jb      L(less_1bytes)
     mov     %cl, (%rdi)
L(less_1bytes):
     ret

> +	ALIGN(4)
> +L(256bytesormore):
> +	vinserti128 $1, %xmm0, %ymm0, %ymm0
> +	vmovups	%ymm0, (%rdi)
> +	mov	%rdi, %r9
> +	and	$-0x20, %rdi
> +	add	$32, %rdi
> +	sub	%rdi, %r9
> +	add	%r9, %rdx
> +	cmp	$4096, %rdx
> +	ja	L(gobble_data)
> +
> +	sub	$0x80, %rdx
> +L(gobble_128_loop):
> +	vmovaps	%ymm0, (%rdi)
> +	vmovaps	%ymm0, 0x20(%rdi)
> +	vmovaps	%ymm0, 0x40(%rdi)
> +	vmovaps	%ymm0, 0x60(%rdi)
> +	lea	0x80(%rdi), %rdi
> +	sub	$0x80, %rdx
> +	jae	L(gobble_128_loop)
> +	vmovups	%ymm0, -0x80(%r8)
> +	vmovups	%ymm0, -0x60(%r8)
> +	vmovups	%ymm0, -0x40(%r8)
> +	vmovups	%ymm0, -0x20(%r8)
> +	vzeroupper
> +	ret
> +

I looked into this with objdump and the loop is misaligned by 5 bytes, which
could be a problem if Haswell cannot handle that.
If you align the loop as below, does that improve performance?

	.p2align 4
	ret; ret; ret; ret; ret
L(256bytesormore):


Also, in this pattern:

> +
> +	sub	$0x80, %rdx

gcc saves three bytes here by using
	add	$-0x80, %rdx

A third possible optimization is to move the vmovups stores before the loop,
which improves latency, but it needs to be tested on Haswell.


> +	ALIGN(4)
> +L(gobble_data):
> +#ifdef SHARED_CACHE_SIZE_HALF
> +	mov	$SHARED_CACHE_SIZE_HALF, %r9
> +#else
> +	mov	__x86_shared_cache_size_half(%rip), %r9
> +#endif

typo here, __x86_64_shared_cache_size_half

> +	shl	$4, %r9
> +	cmp	%r9, %rdx
> +	ja	L(gobble_big_data)
> +	mov	%rax, %r9
> +	mov	%esi, %eax
> +	mov	%rdx, %rcx
> +	rep	stosb

How does this compare with the stosq equivalent?

> +	mov	%r9, %rax
> +	vzeroupper
> +	ret
> +
  
ling.ma.program@gmail.com April 18, 2014, 7:20 a.m. UTC | #3
Hi Ondra,

Thanks for your comments; I have made the changes below, and the new version
has been sent to you.

Regards
Ling

>
>> diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S
>> b/sysdeps/x86_64/multiarch/memset-avx2.S
>> new file mode 100644
>> index 0000000..5d4a487
>> --- /dev/null
>> +++ b/sysdeps/x86_64/multiarch/memset-avx2.S
>> @@ -0,0 +1,192 @@
>> +/* memset with AVX2
>> +   Copyright (C) 2014 Free Software Foundation, Inc.
>> +   Contributed by Alibaba Group.
>
> We no longer add Contributed by.
Removed in the new version.
>
>> +#include "asm-syntax.h"
>> +#ifndef ALIGN
>> +# define ALIGN(n)	.p2align n
>> +#endif
>
> Also in meantime we decided to remove ALIGN macro so remove that and
> s/ALIGN(3)/.p2align 4/  s/ALIGN(4)/.p2align 4/
Fixed in the new version.
>
>> +
>> +ENTRY (MEMSET)
>> +	vpxor	%xmm0, %xmm0, %xmm0
>> +	vmovd %esi, %xmm1
>> +	lea	(%rdi, %rdx), %r8
>
> < snip >
>
>> +	vmovups %xmm0, 0x70(%rdi)
>> +	vmovups %xmm0, -0x80(%r8)
>
> I would globally replace %r8 by %rsi, this makes instruction byte shorter,
> %r9 is similar.
Fixed in the new version; we also place branch instructions in different
16-byte blocks to improve branch prediction accuracy.

>
>> +L(less_4bytes):
>> +	cmp	$2, %edx
>> +	jb	L(less_2bytes)
>> +	mov	%cx, (%rdi)
>> +	mov	%cx, -0x02(%r8)
>> +	ret
>> +	ALIGN(4)
>> +L(less_2bytes):
>> +	cmp	$1, %edx
>> +	jb	L(less_1bytes)
>> +	mov	%cl, (%rdi)
>> +L(less_1bytes):
>> +	ret
>> +
>
> Here current implementation saves one comparison by
>
Done in the new version.
> L(less_4bytes):
>      cmp     $1, %edx
>      jbe     L(less_2bytes)
>      mov     %cx, (%rdi)
>      mov     %cx, -0x02(%r8)
>      ret
>      ALIGN(4)
> L(less_2bytes):
>      jb      L(less_1bytes)
>      mov     %cl, (%rdi)
> L(less_1bytes):
>      ret
>
>> +	ALIGN(4)
>> +L(256bytesormore):
>> +	vinserti128 $1, %xmm0, %ymm0, %ymm0
>> +	vmovups	%ymm0, (%rdi)
>> +	mov	%rdi, %r9
>> +	and	$-0x20, %rdi
>> +	add	$32, %rdi
>> +	sub	%rdi, %r9
>> +	add	%r9, %rdx
>> +	cmp	$4096, %rdx
>> +	ja	L(gobble_data)
>> +
>> +	sub	$0x80, %rdx
>> +L(gobble_128_loop):
>> +	vmovaps	%ymm0, (%rdi)
>> +	vmovaps	%ymm0, 0x20(%rdi)
>> +	vmovaps	%ymm0, 0x40(%rdi)
>> +	vmovaps	%ymm0, 0x60(%rdi)
>> +	lea	0x80(%rdi), %rdi
>> +	sub	$0x80, %rdx
>> +	jae	L(gobble_128_loop)
>> +	vmovups	%ymm0, -0x80(%r8)
>> +	vmovups	%ymm0, -0x60(%r8)
>> +	vmovups	%ymm0, -0x40(%r8)
>> +	vmovups	%ymm0, -0x20(%r8)
>> +	vzeroupper
>> +	ret
>> +
>
> I looked into this by objdump and loop is misaligned by 5 bytes which
> could be problem if haswell could not handle that.
> If you align loop as below does that it improve performance?
>
Fixed the issue; the loop is now aligned.
> 	.p2align 4
> 	ret; ret; ret; ret; ret
> L(256bytesormore):
>
>
> also in this pattern
>
>> +
>> +	sub	$0x80, %rdx
>
> A gcc saves three bytes by using
> 	add	$-0x80, %rdx
>
Changed using a similar method.
> Third possible optimization is move vmovups before loop which improves
> latency but it needs to be tested on haswell.
>

I tested the variant below, but it hurt performance; the original code
benefits from the hardware prefetcher because of its sequential access
pattern:

	vmovdqu	%ymm0, -0x80(%rsi)
	vmovdqu	%ymm0, -0x60(%rsi)
	vmovdqu	%ymm0, -0x40(%rsi)
	vmovdqu	%ymm0, -0x20(%rsi)
	sub	%ecx, %edx
L(gobble_128_loop):
	vmovdqa	%ymm0, (%rdi)
	vmovdqa	%ymm0, 0x20(%rdi)
	vmovdqa	%ymm0, 0x40(%rdi)
	vmovdqa	%ymm0, 0x60(%rdi)
	add	%rcx, %rdi
	sub	%ecx, %edx
	jae	L(gobble_128_loop)
	...
>
>> +	ALIGN(4)
>> +L(gobble_data):
>> +#ifdef SHARED_CACHE_SIZE_HALF
>> +	mov	$SHARED_CACHE_SIZE_HALF, %r9
>> +#else
>> +	mov	__x86_shared_cache_size_half(%rip), %r9
>> +#endif
>
> typo here, __x86_64_shared_cache_size_half
__x86_64_shared_cache_size_half causes a crash, so we keep
__x86_shared_cache_size_half.

>
>> +	shl	$4, %r9
>> +	cmp	%r9, %rdx
>> +	ja	L(gobble_big_data)
>> +	mov	%rax, %r9
>> +	mov	%esi, %eax
>> +	mov	%rdx, %rcx
>> +	rep	stosb
>
> How does this compares with stosq equivalent?
Yes, we tested that, but saw no improvement.
>
>> +	mov	%r9, %rax
>> +	vzeroupper
>> +	ret
>> +
>
>
  
Ondrej Bilka May 13, 2014, 5:36 p.m. UTC | #4
On Mon, Apr 07, 2014 at 01:57:18AM -0400, ling.ma.program@gmail.com wrote:
> From: Ling Ma <ling.ml@alibaba-inc.com>
> 
> In this patch we take advantage of HSW memory bandwidth, manage to
> reduce miss branch prediction by avoid using branch instructions and
> force destination to be aligned with avx instruction. 
> 
Now that we have a Haswell machine in our department, I have tested this
implementation. The benchmark used and the results are here:

http://kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_avx130514.tar.bz2
http://kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_avx.html

This patch improves large inputs and does not regress small inputs much,
which gives a total 10% improvement on the gcc test. It could be improved
further, but it now looks good enough.

I tried two alternatives. The first is using AVX2 in the header (memset_fuse).
It looks like it helps, adding an additional 0.5% of performance. However, I
tried to cross-check this with the bash shell, where the comparison goes in
the opposite direction, so I am not entirely sure yet; see

http://kam.mff.cuni.cz/~ondra/benchmark_string/haswell/memset_profile_avx/results_bash/result.html


The second is checking whether the rep stosb threshold is the best one. This
depends on the application's cache layout and I do not have a definite answer
yet (memset_rep and memset_avx_v2 variants); when the data is in the L2 cache
we could lower the threshold to 1024 bytes, but it slows down real inputs for
some reason.
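
For reference, the size dispatch in the submitted code is roughly the
following (a C-like skeleton, not the glibc source; cache_half stands for
SHARED_CACHE_SIZE_HALF / __x86_shared_cache_size_half):

#include <stddef.h>

/* Skeleton of the dispatch in memset-avx2.S; the bodies are elided.  */
static void
memset_dispatch (char *d, int c, size_t n, size_t cache_half)
{
  if (n < 256)
    { /* branchless small-size classes: overlapping unaligned stores */ }
  else if (n <= 4096)
    { /* 32-byte-aligned vmovdqa/vmovaps store loop */ }
  else if (n <= cache_half * 16)
    { /* rep stosb: this is the threshold under discussion */ }
  else
    { /* non-temporal vmovntdq loop followed by sfence */ }
}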


> The CPU2006 403.gcc benchmark also indicate this patch improves performance
> from  22.9% to 59% compared with original memset implemented by sse2.
>
I inspected that benchmark with my profiler; it is not that good, as it is
only a simple part of gcc and two thirds of the total time is spent on
240-byte inputs.

A large part of the speedup can be explained by the AVX2 implementation
having a special-case branch for the 128-256 byte range, whereas the current
one uses a loop. These size distributions are different from other programs
and from running gcc itself, where short inputs are more common.
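
(Concretely, the special case for 128-256 bytes is just sixteen unaligned
16-byte stores, eight anchored at the start of the buffer and eight at the
end, overlapping in the middle.  Roughly the following C; the assembly
writes the sixteen stores out straight-line, so there is no loop branch:)

#include <emmintrin.h>
#include <stddef.h>

/* Illustrative C for the 128..256 byte class; not the actual code.  */
static void
set_128_to_256 (char *dst, int c, size_t n)
{
  __m128i v = _mm_set1_epi8 ((char) c);
  for (int i = 0; i < 8; i++)
    {
      _mm_storeu_si128 ((__m128i *) (dst + 16 * i), v);
      _mm_storeu_si128 ((__m128i *) (dst + n - 16 * (i + 1)), v);
    }
}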


> +	ALIGN(4)
> +L(gobble_data):
> +#ifdef SHARED_CACHE_SIZE_HALF
> +	mov	$SHARED_CACHE_SIZE_HALF, %r9
> +#else
> +	mov	__x86_shared_cache_size_half(%rip), %r9
> +#endif
> +	shl	$4, %r9
> +	cmp	%r9, %rdx
> +	ja	L(gobble_big_data)
> +	mov	%rax, %r9
> +	mov	%esi, %eax
> +	mov	%rdx, %rcx
> +	rep	stosb
> +	mov	%r9, %rax
> +	vzeroupper
> +	ret
> +
> +	ALIGN(4)
> +L(gobble_big_data):
> +	sub	$0x80, %rdx
> +L(gobble_big_data_loop):
> +	vmovntdq	%ymm0, (%rdi)
> +	vmovntdq	%ymm0, 0x20(%rdi)
> +	vmovntdq	%ymm0, 0x40(%rdi)
> +	vmovntdq	%ymm0, 0x60(%rdi)
> +	lea	0x80(%rdi), %rdi
> +	sub	$0x80, %rdx
> +	jae	L(gobble_big_data_loop)
> +	vmovups	%ymm0, -0x80(%r8)
> +	vmovups	%ymm0, -0x60(%r8)
> +	vmovups	%ymm0, -0x40(%r8)
> +	vmovups	%ymm0, -0x20(%r8)
> +	vzeroupper
> +	sfence
> +	ret

That loop does not seem to help on Haswell at all; it is indistinguishable
from the rep stosb path above. I used the following benchmark to check this
with different sizes, but performance stayed the same.

#include <stdlib.h>
#include <string.h>

/* MEMSET is defined on the compiler command line,
   e.g. -DMEMSET=__memset_avx2 (see below).  */
int main (void)
{
  int i;
  char *x = malloc (100000000);
  for (i = 0; i < 100; i++)
    MEMSET (x, 0, 100000000);
  return 0;
}


for I in `seq 1 10`; do
echo avx
gcc -L. -DMEMSET=__memset_avx2 -lc_profile big.c
time LD_LIBRARY_PATH=. ./a.out
echo rep
gcc -L. -DMEMSET=__memset_rep -lc_profile big.c
time LD_LIBRARY_PATH=. ./a.out
done
  
ling.ma.program@gmail.com May 15, 2014, 1:05 a.m. UTC | #5
2014-05-14 1:36 GMT+08:00, Ondřej Bílka <neleai@seznam.cz>:
>
>
> On Mon, Apr 07, 2014 at 01:57:18AM -0400, ling.ma.program@gmail.com wrote:
>> From: Ling Ma <ling.ml@alibaba-inc.com>
>>
>> In this patch we take advantage of HSW memory bandwidth, manage to
>> reduce miss branch prediction by avoid using branch instructions and
>> force destination to be aligned with avx instruction.
>>
> Now when we have a haswell machine on our department I tested this
> implementation. Benchmark used and results are here.
>
> http://kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_avx130514.tar.bz2
> http://kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_avx.html
>
> This patch improves large inputs and does not regress
> small inputs much which gives a total 10% improvement on gcc test, it
> could be improved but it now looks good enough.
Ling: Thanks, Ondra! You have given us many good suggestions and much encouragement.

> I tried two alternatives. First is using avx2 in header(memset_fuse).
> It look it helps, it adds additional 0.5% of performance. However I tried
> to
> crosscheck this with bash shell where comparison is in opposite
> direction so I not entirely sure yet, see
>
> http://kam.mff.cuni.cz/~ondra/benchmark_string/haswell/memset_profile_avx/results_bash/result.html
>
Ling: Yes, we did the experiment on our 403.gcc workload (I list the download
address below); it slows performance for sizes between 0 and 256 bytes,
although another benchmark gave us a good result, which is why I said it hurt
performance in my last email.

>
> Second is checking if rep treshold is best one,
> this depends on application cache layout I do not have definite answer
> yet (memset_rep and memset_avx_v2 variants), when data is in L2 cache we
> could lower treshold to 1024 bytes but it slows real inputs for some
> reason.
>
Ling: Yes, for this reason I once tried to use a prefetch instruction in the
code, and it also hurts performance when the data is in L1.
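
(The prefetch experiment was along these lines; purely illustrative, with an
arbitrary prefetch distance, not the code that was actually tested.  Compile
with -mavx.)

#include <immintrin.h>
#include <stddef.h>

/* Store loop with an explicit software prefetch a few cache lines ahead;
   assumes n is a multiple of 32.  Prefetching past the end of the buffer
   is harmless.  */
static void
set_with_prefetch (char *dst, int c, size_t n)
{
  __m256i v = _mm256_set1_epi8 ((char) c);
  for (size_t i = 0; i < n; i += 32)
    {
      _mm_prefetch (dst + i + 512, _MM_HINT_T0);	/* distance is arbitrary */
      _mm256_storeu_si256 ((__m256i *) (dst + i), v);
    }
}
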
>
>> The CPU2006 403.gcc benchmark also indicate this patch improves
>> performance
>> from  22.9% to 59% compared with original memset implemented by sse2.
>>
> I inspected that benchmark with my profiler is not that good as its only
> simple
> part of gcc and two third of total time is spend on 240 long inputs.
Ling: Please download www.yunos.org/tmp/test.memcpy.memset.zip, which
includes our whole benchmark, readme.txt and result.xls; we can run and
check it.

>
> A large part of speedup could be explained that avx2 implementation has
> a special case branch for 128-256 byte range but current one uses loop.
> These distributions are different from other program and running gcc
> itself as short inputs are more common there.
>
>
>> +	ALIGN(4)
>> +L(gobble_data):
>> +#ifdef SHARED_CACHE_SIZE_HALF
>> +	mov	$SHARED_CACHE_SIZE_HALF, %r9
>> +#else
>> +	mov	__x86_shared_cache_size_half(%rip), %r9
>> +#endif
>> +	shl	$4, %r9
>> +	cmp	%r9, %rdx
>> +	ja	L(gobble_big_data)
>> +	mov	%rax, %r9
>> +	mov	%esi, %eax
>> +	mov	%rdx, %rcx
>> +	rep	stosb
>> +	mov	%r9, %rax
>> +	vzeroupper
>> +	ret
>> +
>> +	ALIGN(4)
>> +L(gobble_big_data):
>> +	sub	$0x80, %rdx
>> +L(gobble_big_data_loop):
>> +	vmovntdq	%ymm0, (%rdi)
>> +	vmovntdq	%ymm0, 0x20(%rdi)
>> +	vmovntdq	%ymm0, 0x40(%rdi)
>> +	vmovntdq	%ymm0, 0x60(%rdi)
>> +	lea	0x80(%rdi), %rdi
>> +	sub	$0x80, %rdx
>> +	jae	L(gobble_big_data_loop)
>> +	vmovups	%ymm0, -0x80(%r8)
>> +	vmovups	%ymm0, -0x60(%r8)
>> +	vmovups	%ymm0, -0x40(%r8)
>> +	vmovups	%ymm0, -0x20(%r8)
>> +	vzeroupper
>> +	sfence
>> +	ret
>
> That loop does seem to help on haswell at all, It is indistingushible from
> rep stosb loop above. I used following benchmark to check that with
> different sizes but performance stayed same.
>
> #include <stdlib.h>
> #include <string.h>
> int main(){
>  int i;
>  char *x=malloc(100000000);
>   for (i=0;i<100;i++)
>    MEMSET(x,0,100000000);
>
> }
>
>
> for I in `seq 1 10`; do
> echo avx
> gcc -L. -DMEMSET=__memset_avx2 -lc_profile big.c
> time LD_LIBRARY_PATH=. ./a.out
> echo rep
> gcc -L. -DMEMSET=__memset_rep -lc_profile big.c
> time LD_LIBRARY_PATH=. ./a.out
> done

Ling: OK, I will test it carefully, then send out a new version.

Thanks!
Ling
  
Ondrej Bilka May 15, 2014, 8:14 p.m. UTC | #6
A correction to the following:

On Tue, May 13, 2014 at 07:36:16PM +0200, Ondřej Bílka wrote:
> > +	ALIGN(4)
> > +L(gobble_data):
> > +#ifdef SHARED_CACHE_SIZE_HALF
> > +	mov	$SHARED_CACHE_SIZE_HALF, %r9
> > +#else
> > +	mov	__x86_shared_cache_size_half(%rip), %r9
> > +#endif
> > +	shl	$4, %r9
> > +	cmp	%r9, %rdx
> > +	ja	L(gobble_big_data)
> > +	mov	%rax, %r9
> > +	mov	%esi, %eax
> > +	mov	%rdx, %rcx
> > +	rep	stosb
> > +	mov	%r9, %rax
> > +	vzeroupper
> > +	ret
> > +
> > +	ALIGN(4)
> > +L(gobble_big_data):
> > +	sub	$0x80, %rdx
> > +L(gobble_big_data_loop):
> > +	vmovntdq	%ymm0, (%rdi)
> > +	vmovntdq	%ymm0, 0x20(%rdi)
> > +	vmovntdq	%ymm0, 0x40(%rdi)
> > +	vmovntdq	%ymm0, 0x60(%rdi)
> > +	lea	0x80(%rdi), %rdi
> > +	sub	$0x80, %rdx
> > +	jae	L(gobble_big_data_loop)
> > +	vmovups	%ymm0, -0x80(%r8)
> > +	vmovups	%ymm0, -0x60(%r8)
> > +	vmovups	%ymm0, -0x40(%r8)
> > +	vmovups	%ymm0, -0x20(%r8)
> > +	vzeroupper
> > +	sfence
> > +	ret
> 
> That loop does seem to help on haswell at all, It is indistingushible from
> rep stosb loop above. I used following benchmark to check that with
> different sizes but performance stayed same.
> 
> #include <stdlib.h>
> #include <string.h>
> int main(){
>  int i;
>  char *x=malloc(100000000);
>   for (i=0;i<100;i++)
>    MEMSET(x,0,100000000);
> 
> }
> 
> 
> for I in `seq 1 10`; do
> echo avx
> gcc -L. -DMEMSET=__memset_avx2 -lc_profile big.c
> time LD_LIBRARY_PATH=. ./a.out
> echo rep
> gcc -L. -DMEMSET=__memset_rep -lc_profile big.c
> time LD_LIBRARY_PATH=. ./a.out
> done

Sorry, I had forgotten that __memset_rep also has a branch for large inputs,
so what I wrote was wrong.

I retested with a fixed rep stosq variant, and your loop is around 20% slower
on a similar test, so it is better to remove that loop.
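
(For reference, the two large-block strategies being compared look roughly
like this in C; illustrative only, both assume a 32-byte-aligned destination
and a length that is a multiple of 128.  Compile with -mavx.)

#include <immintrin.h>
#include <stddef.h>

/* Streaming-store path, corresponding to the vmovntdq loop.  */
static void
set_nontemporal (char *dst, int c, size_t n)
{
  __m256i v = _mm256_set1_epi8 ((char) c);
  for (size_t i = 0; i < n; i += 128)
    {
      _mm256_stream_si256 ((__m256i *) (dst + i), v);
      _mm256_stream_si256 ((__m256i *) (dst + i + 32), v);
      _mm256_stream_si256 ((__m256i *) (dst + i + 64), v);
      _mm256_stream_si256 ((__m256i *) (dst + i + 96), v);
    }
  _mm_sfence ();	/* order the non-temporal stores */
}

/* rep stos path, corresponding to the rep stosb block above.  */
static void
set_rep_stos (char *dst, int c, size_t n)
{
  __asm__ __volatile__ ("rep stosb"
			: "+D" (dst), "+c" (n)
			: "a" (c)
			: "memory");
}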

$ gcc big.c -o big
$ time LD_PRELOAD=./memset-avx2.so ./big

real    0m0.076s
user    0m0.066s
sys     0m0.010s

$ time LD_PRELOAD=./memset_rep.so ./big

real    0m0.063s
user    0m0.042s
sys     0m0.021s

I used a different benchmark to be sure; it can be downloaded here, and you
can run the commands above in that directory.

http://kam.mff.cuni.cz/~ondra/memset_consistency_benchmark.tar.bz2

For each implementation you need to create a .so exporting a memset function;
there is a compile script that compiles all .s files, provided that their
first line is of the shape

# arch_requirement function_name color
  
ling.ma.program@gmail.com May 30, 2014, 9:02 a.m. UTC | #7
Hi all,

Here is the latest memset patch: http://www.yunos.org/tmp/memset-avx2.patch

When I send the patch by git-send-email, libc-alpha@sourceware.org refuses
to show it. Sorry for the inconvenience.

Thanks
Ling


2014-05-16 4:14 GMT+08:00, Ondřej Bílka <neleai@seznam.cz>:
> Correction, in for following
>
> On Tue, May 13, 2014 at 07:36:16PM +0200, Ondřej Bílka wrote:
>> > +	ALIGN(4)
>> > +L(gobble_data):
>> > +#ifdef SHARED_CACHE_SIZE_HALF
>> > +	mov	$SHARED_CACHE_SIZE_HALF, %r9
>> > +#else
>> > +	mov	__x86_shared_cache_size_half(%rip), %r9
>> > +#endif
>> > +	shl	$4, %r9
>> > +	cmp	%r9, %rdx
>> > +	ja	L(gobble_big_data)
>> > +	mov	%rax, %r9
>> > +	mov	%esi, %eax
>> > +	mov	%rdx, %rcx
>> > +	rep	stosb
>> > +	mov	%r9, %rax
>> > +	vzeroupper
>> > +	ret
>> > +
>> > +	ALIGN(4)
>> > +L(gobble_big_data):
>> > +	sub	$0x80, %rdx
>> > +L(gobble_big_data_loop):
>> > +	vmovntdq	%ymm0, (%rdi)
>> > +	vmovntdq	%ymm0, 0x20(%rdi)
>> > +	vmovntdq	%ymm0, 0x40(%rdi)
>> > +	vmovntdq	%ymm0, 0x60(%rdi)
>> > +	lea	0x80(%rdi), %rdi
>> > +	sub	$0x80, %rdx
>> > +	jae	L(gobble_big_data_loop)
>> > +	vmovups	%ymm0, -0x80(%r8)
>> > +	vmovups	%ymm0, -0x60(%r8)
>> > +	vmovups	%ymm0, -0x40(%r8)
>> > +	vmovups	%ymm0, -0x20(%r8)
>> > +	vzeroupper
>> > +	sfence
>> > +	ret
>>
>> That loop does seem to help on haswell at all, It is indistingushible
>> from
>> rep stosb loop above. I used following benchmark to check that with
>> different sizes but performance stayed same.
>>
>> #include <stdlib.h>
>> #include <string.h>
>> int main(){
>>  int i;
>>  char *x=malloc(100000000);
>>   for (i=0;i<100;i++)
>>    MEMSET(x,0,100000000);
>>
>> }
>>
>>
>> for I in `seq 1 10`; do
>> echo avx
>> gcc -L. -DMEMSET=__memset_avx2 -lc_profile big.c
>> time LD_LIBRARY_PATH=. ./a.out
>> echo rep
>> gcc -L. -DMEMSET=__memset_rep -lc_profile big.c
>> time LD_LIBRARY_PATH=. ./a.out
>> done
>
> Sorry I forgotten that __memset_rep also has branch for large inputs so
> what I wrote was wrong.
>
> I retested it with fixed rep stosq and your loop is around 20% slower on
> similar test so its better to remove that loop.
>
> $ gcc big.c -o big
> $ time LD_PRELOAD=./memset-avx2.so ./big
>
> real    0m0.076s
> user    0m0.066s
> sys     0m0.010s
>
> $ time LD_PRELOAD=./memset_rep.so ./big
>
> real    0m0.063s
> user    0m0.042s
> sys     0m0.021s
>
> I use a different benchmark to be sure, it could be download here and
> run it commands above in that directory.
>
> http://kam.mff.cuni.cz/~ondra/memset_consistency_benchmark.tar.bz2
>
> For different implementation you need to create .so with function
> memset, there is script compile that compiles all .s files provided that
> first line is of shape
>
> # arch_requirement function_name color
>
>
  
Ondrej Bilka May 30, 2014, 11:30 a.m. UTC | #8
On Fri, May 30, 2014 at 05:02:29PM +0800, Ling Ma wrote:
> Hi all,
> 
> Here is latest memset pach: http://www.yunos.org/tmp/memset-avx2.patch
> 
> When I send patch by git-send-email, libc-alpha@sourceware.org refuse
> to show it,
> Sorry for  Inconvenience to you
> 
It looks like a typo in the address; yonos.org does not exist.
  
ling.ma.program@gmail.com May 30, 2014, 2:10 p.m. UTC | #9
Ondra,

I retried and can get the patch from http://www.yunos.org/tmp/memset-avx2.patch;
it is yunos.org.

Thanks
Ling


2014-05-30 19:30 GMT+08:00, Ondřej Bílka <neleai@seznam.cz>:
> On Fri, May 30, 2014 at 05:02:29PM +0800, Ling Ma wrote:
>> Hi all,
>>
>> Here is latest memset pach: http://www.yunos.org/tmp/memset-avx2.patch
>>
>> When I send patch by git-send-email, libc-alpha@sourceware.org refuse
>> to show it,
>> Sorry for  Inconvenience to you
>>
> It looks its typo in adress, yonos.org does not exist.
>
>
>
  
H.J. Lu June 3, 2014, 11:56 p.m. UTC | #10
On Fri, May 30, 2014 at 7:10 AM, Ling Ma <ling.ma.program@gmail.com> wrote:
> Ondra,
>
> I retried and get the patch from http://www.yunos.org/tmp/memset-avx2.patch
> it is yunos.org .

http://www.yunos.org/tmp/memset-avx2.patch

times out for me.  Can you gzip the patch and send it as
an attachment?

Thanks.

> Thanks
> Ling
>
>
> 2014-05-30 19:30 GMT+08:00, Ondřej Bílka <neleai@seznam.cz>:
>> On Fri, May 30, 2014 at 05:02:29PM +0800, Ling Ma wrote:
>>> Hi all,
>>>
>>> Here is latest memset pach: http://www.yunos.org/tmp/memset-avx2.patch
>>>
>>> When I send patch by git-send-email, libc-alpha@sourceware.org refuse
>>> to show it,
>>> Sorry for  Inconvenience to you
>>>
>> It looks its typo in adress, yonos.org does not exist.
>>
>>
>>
  
ling.ma.program@gmail.com June 4, 2014, 7 a.m. UTC | #11
H.J

The website changed its IP address; the code is now available again at
http://www.yunos.org/tmp/memset-avx2.patch,
and it is also gzipped as an attachment to this mail.

Thanks
Ling


2014-06-04 7:56 GMT+08:00, H.J. Lu <hjl.tools@gmail.com>:
> On Fri, May 30, 2014 at 7:10 AM, Ling Ma <ling.ma.program@gmail.com> wrote:
>> Ondra,
>>
>> I retried and get the patch from
>> http://www.yunos.org/tmp/memset-avx2.patch
>> it is yunos.org .
>
> http://www.yunos.org/tmp/memset-avx2.patch
>
> times out for me.  Can you gzip the patch and send it as
> an attachment?
>
> Thanks.
>
>> Thanks
>> Ling
>>
>>
>> 2014-05-30 19:30 GMT+08:00, Ondřej Bílka <neleai@seznam.cz>:
>>> On Fri, May 30, 2014 at 05:02:29PM +0800, Ling Ma wrote:
>>>> Hi all,
>>>>
>>>> Here is latest memset pach: http://www.yunos.org/tmp/memset-avx2.patch
>>>>
>>>> When I send patch by git-send-email, libc-alpha@sourceware.org refuse
>>>> to show it,
>>>> Sorry for  Inconvenience to you
>>>>
>>> It looks its typo in adress, yonos.org does not exist.
>>>
>>>
>>>
>
>
>
> --
> H.J.
>
  
Ondrej Bilka June 5, 2014, 4:32 p.m. UTC | #12
On Wed, Jun 04, 2014 at 03:00:05PM +0800, Ling Ma wrote:
> H.J
> 
> The website changed IP, now the code is available again:
>  http://www.yunos.org/tmp/memset-avx2.patch ,
> and also gziped as attachment in this mail.
> 
> Thanks
> Ling
>
 
Now the performance looks OK to me, but there are a few formatting problems.
With these fixed I would be satisfied. H.J., do you have comments?

A possible follow-up is to also optimize __bzero as we do in the general
case.

A second follow-up would be to decrease the function size by reshuffling
blocks; in several places there are 15/16 free bytes due to alignment.

Formatting problems are here:

+	vpxor	%xmm0, %xmm0, %xmm0
+	vmovd %esi, %xmm1
+	mov	%rdi, %rsi
+	mov	%rdi, %rax

here

+L(less_16bytes):
+	vmovd %xmm0, %rcx
+	cmp	$8, %dl
+	jb	L(less_8bytes)
+	mov %rcx, (%rdi)
+	mov %rcx, -0x08(%rsi)
+	ret
+
+	.p2align 4
+L(less_8bytes):
+	cmp	$4, %dl
+	jb	L(less_4bytes)
+	mov %ecx, (%rdi)
+	mov %ecx, -0x04(%rsi)
+	ret

and here

+	mov	%rax, %rsi
+	vmovd %xmm0, %eax
+	mov	%rdx, %rcx

As I mentioned regarding code size, one trick is that instructions with a
-128 immediate are shorter than those with 128, since -128 fits in a
sign-extended 8-bit immediate while +128 needs a 32-bit one. You could save
16 bytes with the following modification; however, it must be tested whether
it improves performance.


--- x	2014-06-05 18:20:35.313645591 +0200
+++ sysdeps/x86_64/multiarch/memset-avx2.S	2014-06-05 18:22:25.068642767 +0200
@@ -95,7 +95,6 @@
 	.p2align 4
 L(256bytesormore):
 	vinserti128 $1, %xmm0, %ymm0, %ymm0
-	mov	$0x80, %rcx
 	add	%rdx, %rsi
 	mov	%rdi, %r9
 	vmovdqu	%ymm0, (%rdi)
@@ -105,15 +104,15 @@
 	add	%r9, %rdx
 	cmp	$4096, %rdx
 	ja	L(gobble_data)
-	sub	%ecx, %edx
+	add	$-128, %edx
 L(gobble_128_loop):
 	vmovdqa	%ymm0, (%rdi)
 	vmovdqa	%ymm0, 0x20(%rdi)
 	vmovdqa	%ymm0, 0x40(%rdi)
 	vmovdqa	%ymm0, 0x60(%rdi)
-	add	%rcx, %rdi
-	sub	%ecx, %edx
-	jae	L(gobble_128_loop)
+	sub	$-128, %rdi
+	add	$-128, %edx
+	jb	L(gobble_128_loop)
 	vmovdqu	%ymm0, -0x80(%rsi)
 	vmovdqu	%ymm0, -0x60(%rsi)
 	vmovdqu	%ymm0, -0x40(%rsi)
  
H.J. Lu June 5, 2014, 5:29 p.m. UTC | #13
On Thu, Jun 5, 2014 at 9:32 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> On Wed, Jun 04, 2014 at 03:00:05PM +0800, Ling Ma wrote:
>> H.J
>>
>> The website changed IP, now the code is available again:
>>  http://www.yunos.org/tmp/memset-avx2.patch ,
>> and also gziped as attachment in this mail.
>>
>> Thanks
>> Ling
>>
>
> Now performance looks ok for me, but few formating problems.
> With these fixed I would be satisfied H.J do you have comments?

I don't have any additional comments.  Thanks.

> There is possible followup to also optimize __bzero like we do in
> general case.
>
> Then second followup would be decrease function size by reshuffling
> blocks, on several places there are 15/16 free bytes due alignment.
>
> Formatting problems are here:
>
> +       vpxor   %xmm0, %xmm0, %xmm0
> +       vmovd %esi, %xmm1
> +       mov     %rdi, %rsi
> +       mov     %rdi, %rax
>
> here
>
> +L(less_16bytes):
> +       vmovd %xmm0, %rcx
> +       cmp     $8, %dl
> +       jb      L(less_8bytes)
> +       mov %rcx, (%rdi)
> +       mov %rcx, -0x08(%rsi)
> +       ret
> +
> +       .p2align 4
> +L(less_8bytes):
> +       cmp     $4, %dl
> +       jb      L(less_4bytes)
> +       mov %ecx, (%rdi)
> +       mov %ecx, -0x04(%rsi)
> +       ret
>
> and here
>
> +       mov     %rax, %rsi
> +       vmovd %xmm0, %eax
> +       mov     %rdx, %rcx
>
> As I mentioned code size one trick is that instructions
> with -128 argument are shorter than with 128. You could save 16
> bytes with following modification, however it must be tested if
> it improves performance.
>
>
> --- x   2014-06-05 18:20:35.313645591 +0200
> +++ sysdeps/x86_64/multiarch/memset-avx2.S      2014-06-05
> 18:22:25.068642767 +0200
> @@ -95,7 +95,6 @@
>         .p2align 4
>  L(256bytesormore):
>         vinserti128 $1, %xmm0, %ymm0, %ymm0
> -       mov     $0x80, %rcx
>         add     %rdx, %rsi
>         mov     %rdi, %r9
>         vmovdqu %ymm0, (%rdi)
> @@ -105,15 +104,15 @@
>         add     %r9, %rdx
>         cmp     $4096, %rdx
>         ja      L(gobble_data)
> -       sub     %ecx, %edx
> +       add     $-128, %edx
>  L(gobble_128_loop):
>         vmovdqa %ymm0, (%rdi)
>         vmovdqa %ymm0, 0x20(%rdi)
>         vmovdqa %ymm0, 0x40(%rdi)
>         vmovdqa %ymm0, 0x60(%rdi)
> -       add     %rcx, %rdi
> -       sub     %ecx, %edx
> -       jae     L(gobble_128_loop)
> +       sub     $-128, %rdi
> +       add     $-128, %edx
> +       jb      L(gobble_128_loop)
>         vmovdqu %ymm0, -0x80(%rsi)
>         vmovdqu %ymm0, -0x60(%rsi)
>         vmovdqu %ymm0, -0x40(%rsi)
>
  
ling.ma.program@gmail.com June 6, 2014, 3:03 p.m. UTC | #14
I will send the next version after performance testing according to your comments.

Thanks
Ling

2014-06-06 0:32 GMT+08:00, Ondřej Bílka <neleai@seznam.cz>:
> On Wed, Jun 04, 2014 at 03:00:05PM +0800, Ling Ma wrote:
>> H.J
>>
>> The website changed IP, now the code is available again:
>>  http://www.yunos.org/tmp/memset-avx2.patch ,
>> and also gziped as attachment in this mail.
>>
>> Thanks
>> Ling
>>
>
> Now performance looks ok for me, but few formating problems.
> With these fixed I would be satisfied H.J do you have comments?
>
> There is possible followup to also optimize __bzero like we do in
> general case.
>
> Then second followup would be decrease function size by reshuffling
> blocks, on several places there are 15/16 free bytes due alignment.
>
> Formatting problems are here:
>
> +	vpxor	%xmm0, %xmm0, %xmm0
> +	vmovd %esi, %xmm1
> +	mov	%rdi, %rsi
> +	mov	%rdi, %rax
>
> here
>
> +L(less_16bytes):
> +	vmovd %xmm0, %rcx
> +	cmp	$8, %dl
> +	jb	L(less_8bytes)
> +	mov %rcx, (%rdi)
> +	mov %rcx, -0x08(%rsi)
> +	ret
> +
> +	.p2align 4
> +L(less_8bytes):
> +	cmp	$4, %dl
> +	jb	L(less_4bytes)
> +	mov %ecx, (%rdi)
> +	mov %ecx, -0x04(%rsi)
> +	ret
>
> and here
>
> +	mov	%rax, %rsi
> +	vmovd %xmm0, %eax
> +	mov	%rdx, %rcx
>
> As I mentioned code size one trick is that instructions
> with -128 argument are shorter than with 128. You could save 16
> bytes with following modification, however it must be tested if
> it improves performance.
>
>
> --- x	2014-06-05 18:20:35.313645591 +0200
> +++ sysdeps/x86_64/multiarch/memset-avx2.S	2014-06-05
> 18:22:25.068642767 +0200
> @@ -95,7 +95,6 @@
>  	.p2align 4
>  L(256bytesormore):
>  	vinserti128 $1, %xmm0, %ymm0, %ymm0
> -	mov	$0x80, %rcx
>  	add	%rdx, %rsi
>  	mov	%rdi, %r9
>  	vmovdqu	%ymm0, (%rdi)
> @@ -105,15 +104,15 @@
>  	add	%r9, %rdx
>  	cmp	$4096, %rdx
>  	ja	L(gobble_data)
> -	sub	%ecx, %edx
> +	add	$-128, %edx
>  L(gobble_128_loop):
>  	vmovdqa	%ymm0, (%rdi)
>  	vmovdqa	%ymm0, 0x20(%rdi)
>  	vmovdqa	%ymm0, 0x40(%rdi)
>  	vmovdqa	%ymm0, 0x60(%rdi)
> -	add	%rcx, %rdi
> -	sub	%ecx, %edx
> -	jae	L(gobble_128_loop)
> +	sub	$-128, %rdi
> +	add	$-128, %edx
> +	jb	L(gobble_128_loop)
>  	vmovdqu	%ymm0, -0x80(%rsi)
>  	vmovdqu	%ymm0, -0x60(%rsi)
>  	vmovdqu	%ymm0, -0x40(%rsi)
>
>
  
ling.ma.program@gmail.com June 10, 2014, 1:52 p.m. UTC | #15
In this patch (attached gzipped), we take advantage of HSW memory bandwidth,
reduce branch mispredictions by avoiding branch instructions, and force the
destination to be aligned with AVX and AVX2 instructions.

The CPU2006 403.gcc benchmark indicates that this patch improves performance
by 26% to 59%.

This version incorporates Ondra's comments and avoids branch instructions
that cross 16-byte-aligned boundaries.

Thanks
Ling
  
H.J. Lu June 18, 2014, 4:47 p.m. UTC | #16
On Tue, Jun 10, 2014 at 6:52 AM, Ling Ma <ling.ma.program@gmail.com> wrote:
> In this patch as gziped attachment, we take advantage of HSW memory
> bandwidth, manage to reduce miss branch prediction by avoiding using
> branch instructions and
> force destination to be aligned with avx & avx2 instruction.
>
> The CPU2006 403.gcc benchmark indicates this patch improves performance
> from 26% to 59%.
>
> This version accept Ondra's comments and avoid branch instruction to
> cross 16byte-aligned code.

Any feedback?  I'd like to check it in before 2.20 code freeze.

Thanks.
  
Ondrej Bilka June 19, 2014, 7:12 p.m. UTC | #17
On Wed, Jun 18, 2014 at 09:47:11AM -0700, H.J. Lu wrote:
> On Tue, Jun 10, 2014 at 6:52 AM, Ling Ma <ling.ma.program@gmail.com> wrote:
> > In this patch as gziped attachment, we take advantage of HSW memory
> > bandwidth, manage to reduce miss branch prediction by avoiding using
> > branch instructions and
> > force destination to be aligned with avx & avx2 instruction.
> >
> > The CPU2006 403.gcc benchmark indicates this patch improves performance
> > from 26% to 59%.
> >
> > This version accept Ondra's comments and avoid branch instruction to
> > cross 16byte-aligned code.
> 
> Any feedback?  I'd like to check it in before 2.20 code freeze.
> 
As I said before, it is OK with the formatting fixed; you can commit it if
you wish.
  

Patch

diff --git a/ChangeLog b/ChangeLog
index ab23a3a..851fe9e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@ 
+2014-04-04  Ling Ma  <ling.ml@alibaba-inc.com>
+
+	* sysdeps/x86_64/multiarch/Makefile: Add memset-avx2
+	* sysdeps/x86_64/multiarch/memset-avx2.S: New file for AVX2 memset
+	* sysdeps/x86_64/multiarch/memset.S: New file for multiple memset
+	versions
+	* sysdeps/x86_64/multiarch/memset_chk.S: New file for multiple memset_chk
+	versions
+
 2014-04-04  Sihai Yao  <sihai.ysh@alibaba-inc.com>
 
 	* sysdeps/x86_64/multiarch/ifunc-defines.sym: Add COMMON_CPU_INDEX_7 and
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 57a3c13..42df96f 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -17,7 +17,9 @@  sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
-		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned
+		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
+		   memset-avx2
+
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c varshift
 CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
new file mode 100644
index 0000000..5d4a487
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx2.S
@@ -0,0 +1,192 @@ 
+/* memset with AVX2
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   Contributed by Alibaba Group.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc
+
+#include "asm-syntax.h"
+#ifndef ALIGN
+# define ALIGN(n)	.p2align n
+#endif
+#ifndef MEMSET
+# define MEMSET	__memset_avx2
+# define MEMSET_CHK	__memset_chk_avx2
+#endif
+
+	.section .text.avx2,"ax",@progbits
+#if defined PIC
+ENTRY (MEMSET_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMSET_CHK)
+#endif
+
+ENTRY (MEMSET)
+	vpxor	%xmm0, %xmm0, %xmm0
+	vmovd %esi, %xmm1
+	lea	(%rdi, %rdx), %r8
+	vpshufb	%xmm0, %xmm1, %xmm0
+	mov	%rdi, %rax
+	cmp	$256, %rdx
+	jae	L(256bytesormore)
+	vmovd %xmm0, %rcx
+	cmp	$128, %rdx
+	jb	L(less_128bytes)
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm0, 0x10(%rdi)
+	vmovups %xmm0, 0x20(%rdi)
+	vmovups %xmm0, 0x30(%rdi)
+	vmovups %xmm0, 0x40(%rdi)
+	vmovups %xmm0, 0x50(%rdi)
+	vmovups %xmm0, 0x60(%rdi)
+	vmovups %xmm0, 0x70(%rdi)
+	vmovups %xmm0, -0x80(%r8)
+	vmovups %xmm0, -0x70(%r8)
+	vmovups %xmm0, -0x60(%r8)
+	vmovups %xmm0, -0x50(%r8)
+	vmovups %xmm0, -0x40(%r8)
+	vmovups %xmm0, -0x30(%r8)
+	vmovups %xmm0, -0x20(%r8)
+	vmovups %xmm0, -0x10(%r8)
+	ret
+	ALIGN(4)
+L(less_128bytes):
+	cmp	$64, %edx
+	jb	L(less_64bytes)
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm0, 0x10(%rdi)
+	vmovups %xmm0, 0x20(%rdi)
+	vmovups %xmm0, 0x30(%rdi)
+	vmovups %xmm0, -0x40(%r8)
+	vmovups %xmm0, -0x30(%r8)
+	vmovups %xmm0, -0x20(%r8)
+	vmovups %xmm0, -0x10(%r8)
+	ret
+	ALIGN(4)
+L(less_64bytes):
+	cmp	$32, %edx
+	jb	L(less_32bytes)
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm0, 0x10(%rdi)
+	vmovups %xmm0, -0x20(%r8)
+	vmovups %xmm0, -0x10(%r8)
+	ret
+	ALIGN(4)
+L(less_32bytes):
+	cmp	$16, %edx
+	jb	L(less_16bytes)
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm0, -0x10(%r8)
+	ret
+	ALIGN(4)
+L(less_16bytes):
+	cmp	$8, %edx
+	jb	L(less_8bytes)
+	mov %rcx, (%rdi)
+	mov %rcx, -0x08(%r8)
+	ret
+	ALIGN(4)
+L(less_8bytes):
+	cmp	$4, %edx
+	jb	L(less_4bytes)
+	mov %ecx, (%rdi)
+	mov %ecx, -0x04(%r8)
+	ALIGN(4)
+L(less_4bytes):
+	cmp	$2, %edx
+	jb	L(less_2bytes)
+	mov	%cx, (%rdi)
+	mov	%cx, -0x02(%r8)
+	ret
+	ALIGN(4)
+L(less_2bytes):
+	cmp	$1, %edx
+	jb	L(less_1bytes)
+	mov	%cl, (%rdi)
+L(less_1bytes):
+	ret
+
+	ALIGN(4)
+L(256bytesormore):
+	vinserti128 $1, %xmm0, %ymm0, %ymm0
+	vmovups	%ymm0, (%rdi)
+	mov	%rdi, %r9
+	and	$-0x20, %rdi
+	add	$32, %rdi
+	sub	%rdi, %r9
+	add	%r9, %rdx
+	cmp	$4096, %rdx
+	ja	L(gobble_data)
+
+	sub	$0x80, %rdx
+L(gobble_128_loop):
+	vmovaps	%ymm0, (%rdi)
+	vmovaps	%ymm0, 0x20(%rdi)
+	vmovaps	%ymm0, 0x40(%rdi)
+	vmovaps	%ymm0, 0x60(%rdi)
+	lea	0x80(%rdi), %rdi
+	sub	$0x80, %rdx
+	jae	L(gobble_128_loop)
+	vmovups	%ymm0, -0x80(%r8)
+	vmovups	%ymm0, -0x60(%r8)
+	vmovups	%ymm0, -0x40(%r8)
+	vmovups	%ymm0, -0x20(%r8)
+	vzeroupper
+	ret
+
+	ALIGN(4)
+L(gobble_data):
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %r9
+#else
+	mov	__x86_shared_cache_size_half(%rip), %r9
+#endif
+	shl	$4, %r9
+	cmp	%r9, %rdx
+	ja	L(gobble_big_data)
+	mov	%rax, %r9
+	mov	%esi, %eax
+	mov	%rdx, %rcx
+	rep	stosb
+	mov	%r9, %rax
+	vzeroupper
+	ret
+
+	ALIGN(4)
+L(gobble_big_data):
+	sub	$0x80, %rdx
+L(gobble_big_data_loop):
+	vmovntdq	%ymm0, (%rdi)
+	vmovntdq	%ymm0, 0x20(%rdi)
+	vmovntdq	%ymm0, 0x40(%rdi)
+	vmovntdq	%ymm0, 0x60(%rdi)
+	lea	0x80(%rdi), %rdi
+	sub	$0x80, %rdx
+	jae	L(gobble_big_data_loop)
+	vmovups	%ymm0, -0x80(%r8)
+	vmovups	%ymm0, -0x60(%r8)
+	vmovups	%ymm0, -0x40(%r8)
+	vmovups	%ymm0, -0x20(%r8)
+	vzeroupper
+	sfence
+	ret
+
+END (MEMSET)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
new file mode 100644
index 0000000..df903af
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -0,0 +1,59 @@ 
+/* Multiple versions of memset
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   Contributed by Alibaba Group.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <shlib-compat.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#ifndef NOT_IN_libc
+ENTRY(memset)
+	.type	memset, @gnu_indirect_function
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	__memset_sse2(%rip), %rax
+	testl	$bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	jz	2f
+	leaq	__memset_avx2(%rip), %rax
+2:	ret
+END(memset)
+#endif
+
+#if !defined NOT_IN_libc
+# undef memset
+# define memset __memset_sse2
+
+# undef __memset_chk
+# define __memset_chk __memset_chk_sse2
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal memset calls through a PLT.
+   The speedup we get from using GPR instruction is likely eaten away
+   by the indirect call in the PLT.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memset; __GI_memset = __memset_sse2
+# endif
+
+# undef strong_alias
+# define strong_alias(original, alias)
+#endif
+
+#include "../memset.S"
diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S
new file mode 100644
index 0000000..f048dac
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset_chk.S
@@ -0,0 +1,44 @@ 
+/* Multiple versions of memset_chk
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   Contributed by Alibaba Group.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+ENTRY(__memset_chk)
+	.type	__memset_chk, @gnu_indirect_function
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	__memset_chk_sse2(%rip), %rax
+	testl	$bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	jz	2f
+	leaq	__memset_chk_avx2(%rip), %rax
+2:	ret
+END(__memset_chk)
+
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
+	.section .gnu.warning.__memset_zero_constant_len_parameter
+	.string "memset used with constant zero length parameter; this could be due to transposed parameters"
+# else
+#  include "../memset_chk.S"
+# endif
+#endif