x86-64: Add wmemset optimized with SSE2/AVX2/AVX512
Commit Message
The difference between memset and wmemset is the fill element: byte vs
int.  Add stubs to the SSE2/AVX2/AVX512 memset implementations for
wmemset that broadcast an int instead of a byte and scale the count to
bytes:
SSE2 wmemset:
shl $0x2,%rdx
movd %esi,%xmm0
mov %rdi,%rax
pshufd $0x0,%xmm0,%xmm0
jmp entry_from_wmemset
SSE2 memset:
movd %esi,%xmm0
mov %rdi,%rax
punpcklbw %xmm0,%xmm0
punpcklwd %xmm0,%xmm0
pshufd $0x0,%xmm0,%xmm0
entry_from_wmemset:
Since the ERMS versions of wmemset would require "rep stosl" instead of
"rep stosb", only the vector store stubs of the SSE2/AVX2/AVX512 wmemset
are added.  The SSE2 wmemset is about 3X faster and the AVX2 wmemset is
about 6X faster on Haswell.
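For reference, the C-level behavior being specialized here is just a
wchar_t fill.  A minimal sketch (illustrative only; wmemset_ref is a
hypothetical name, not the glibc implementation):

  #include <stddef.h>
  #include <wchar.h>

  /* Reference behavior of wmemset: store C into N wchar_t elements.
     On x86-64 wchar_t is a 4-byte int, which is why the wmemset stub
     above scales the element count to a byte count (shl $0x2,%rdx)
     before falling into the unchanged memset vector store loops.  */
  wchar_t *
  wmemset_ref (wchar_t *s, wchar_t c, size_t n)
  {
    for (size_t i = 0; i < n; i++)
      s[i] = c;
    return s;
  }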
OK for master?
H.J.
---
* include/wchar.h (__wmemset_chk): New.
* sysdeps/x86_64/memset.S (VDUP_TO_VEC0_AND_SET_RETURN): Renamed
to MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
(WMEMSET_CHK_SYMBOL): Likewise.
(WMEMSET_SYMBOL): Likewise.
(__wmemset): Add hidden definition.
(wmemset): Add weak hidden definition.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add __wmemset_sse2_unaligned,
__wmemset_avx2_unaligned, __wmemset_avx512_unaligned,
__wmemset_chk_sse2_unaligned, __wmemset_chk_avx2_unaligned
and __wmemset_chk_avx512_unaligned.
* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
(WMEMSET_SYMBOL): Likewise.
* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
(WMEMSET_SYMBOL): Likewise.
* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Updated.
(WMEMSET_CHK_SYMBOL): New.
(WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)): Likewise.
(WMEMSET_SYMBOL (__wmemset, unaligned)): Likewise.
* sysdeps/x86_64/multiarch/memset.S (WMEMSET_SYMBOL): New.
(libc_hidden_builtin_def): Also define __GI_wmemset and
__GI___wmemset.
(weak_alias): New.
* sysdeps/x86_64/multiarch/wmemset.S: New file.
* sysdeps/x86_64/multiarch/wmemset_chk.S: Likewise.
* sysdeps/x86_64/wmemset.S: Likewise.
* sysdeps/x86_64/wmemset_chk.S: Likewise.
---
include/wchar.h | 3 ++
sysdeps/x86_64/memset.S | 18 ++++++++-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 22 ++++++++++
.../x86_64/multiarch/memset-avx2-unaligned-erms.S | 8 +++-
.../multiarch/memset-avx512-unaligned-erms.S | 9 ++++-
.../x86_64/multiarch/memset-vec-unaligned-erms.S | 24 +++++++++--
sysdeps/x86_64/multiarch/memset.S | 13 ++++--
sysdeps/x86_64/multiarch/wmemset.S | 47 ++++++++++++++++++++++
sysdeps/x86_64/multiarch/wmemset_chk.S | 46 +++++++++++++++++++++
sysdeps/x86_64/wmemset.S | 1 +
sysdeps/x86_64/wmemset_chk.S | 33 +++++++++++++++
11 files changed, 215 insertions(+), 9 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/wmemset.S
create mode 100644 sysdeps/x86_64/multiarch/wmemset_chk.S
create mode 100644 sysdeps/x86_64/wmemset.S
create mode 100644 sysdeps/x86_64/wmemset_chk.S
Comments
On Sun, May 21, 2017 at 1:34 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> The difference between memset and wmemset is the fill element: byte
> vs int.  Add stubs to the SSE2/AVX2/AVX512 memset implementations
> for wmemset that broadcast an int instead of a byte and scale the
> count to bytes:
>
> SSE2 wmemset:
> shl $0x2,%rdx
> movd %esi,%xmm0
> mov %rdi,%rax
> pshufd $0x0,%xmm0,%xmm0
> jmp entry_from_wmemset
>
> SSE2 memset:
> movd %esi,%xmm0
> mov %rdi,%rax
> punpcklbw %xmm0,%xmm0
> punpcklwd %xmm0,%xmm0
> pshufd $0x0,%xmm0,%xmm0
> entry_from_wmemset:
>
> Since the ERMS versions of wmemset would require "rep stosl" instead of
> "rep stosb", only the vector store stubs of the SSE2/AVX2/AVX512 wmemset
> are added.  The SSE2 wmemset is about 3X faster and the AVX2 wmemset is
> about 6X faster on Haswell.
>
> OK for master?
Any objections?
> H.J.
> ---
> * include/wchar.h (__wmemset_chk): New.
> * sysdeps/x86_64/memset.S (VDUP_TO_VEC0_AND_SET_RETURN): Renamed
> to MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
> (WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
> (WMEMSET_CHK_SYMBOL): Likewise.
> (WMEMSET_SYMBOL): Likewise.
> (__wmemset): Add hidden definition.
> (wmemset): Add weak hidden definition.
> * sysdeps/x86_64/multiarch/ifunc-impl-list.c
> (__libc_ifunc_impl_list): Add __wmemset_sse2_unaligned,
> __wmemset_avx2_unaligned, __wmemset_avx512_unaligned,
> __wmemset_chk_sse2_unaligned, __wmemset_chk_avx2_unaligned
> and __wmemset_chk_avx512_unaligned.
> * sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> (VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
> (MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
> (WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
> (WMEMSET_SYMBOL): Likewise.
> * sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> (VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
> (MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
> (WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
> (WMEMSET_SYMBOL): Likewise.
> * sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Updated.
> (WMEMSET_CHK_SYMBOL): New.
> (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)): Likewise.
> (WMEMSET_SYMBOL (__wmemset, unaligned)): Likewise.
> * sysdeps/x86_64/multiarch/memset.S (WMEMSET_SYMBOL): New.
> (libc_hidden_builtin_def): Also define __GI_wmemset and
> __GI___wmemset.
> (weak_alias): New.
> * sysdeps/x86_64/multiarch/wmemset.S: New file.
> * sysdeps/x86_64/multiarch/wmemset_chk.S: Likewise.
> * sysdeps/x86_64/wmemset.S: Likewise.
> * sysdeps/x86_64/wmemset_chk.S: Likewise.
> ---
> include/wchar.h | 3 ++
> sysdeps/x86_64/memset.S | 18 ++++++++-
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 22 ++++++++++
> .../x86_64/multiarch/memset-avx2-unaligned-erms.S | 8 +++-
> .../multiarch/memset-avx512-unaligned-erms.S | 9 ++++-
> .../x86_64/multiarch/memset-vec-unaligned-erms.S | 24 +++++++++--
> sysdeps/x86_64/multiarch/memset.S | 13 ++++--
> sysdeps/x86_64/multiarch/wmemset.S | 47 ++++++++++++++++++++++
> sysdeps/x86_64/multiarch/wmemset_chk.S | 46 +++++++++++++++++++++
> sysdeps/x86_64/wmemset.S | 1 +
> sysdeps/x86_64/wmemset_chk.S | 33 +++++++++++++++
> 11 files changed, 215 insertions(+), 9 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/wmemset.S
> create mode 100644 sysdeps/x86_64/multiarch/wmemset_chk.S
> create mode 100644 sysdeps/x86_64/wmemset.S
> create mode 100644 sysdeps/x86_64/wmemset_chk.S
>
> diff --git a/include/wchar.h b/include/wchar.h
> index e2579a1..a773d56 100644
> --- a/include/wchar.h
> +++ b/include/wchar.h
> @@ -157,6 +157,9 @@ extern wchar_t *__wmemmove (wchar_t *__s1, const wchar_t *__s2,
> extern wchar_t *__wcschrnul (const wchar_t *__s, wchar_t __wc)
> __attribute_pure__;
>
> +extern wchar_t *__wmemset_chk (wchar_t *__s, wchar_t __c, size_t __n,
> + size_t __ns) __THROW;
> +
> extern int __vfwscanf (__FILE *__restrict __s,
> const wchar_t *__restrict __format,
> __gnuc_va_list __arg)
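The new __wmemset_chk prototype is the _FORTIFY_SOURCE entry point;
__ns is the destination size, which the fortify wrappers pass in
wchar_t elements.  A sketch of the contract (wmemset_chk_ref is a
hypothetical name; the patch implements the same check in assembly as
"cmpq %rdx, %rcx; jb __chk_fail"):

  #include <stddef.h>
  #include <wchar.h>

  extern void __chk_fail (void) __attribute__ ((__noreturn__));

  /* Abort if the request covers more elements than the destination
     holds; otherwise behave exactly like wmemset.  */
  wchar_t *
  wmemset_chk_ref (wchar_t *s, wchar_t c, size_t n, size_t ns)
  {
    if (ns < n)
      __chk_fail ();
    return wmemset (s, c, n);
  }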
> diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> index 69ed509..4127878 100644
> --- a/sysdeps/x86_64/memset.S
> +++ b/sysdeps/x86_64/memset.S
> @@ -26,13 +26,18 @@
> #define VMOVU movdqu
> #define VMOVA movdqa
>
> -#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> +#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> movd d, %xmm0; \
> movq r, %rax; \
> punpcklbw %xmm0, %xmm0; \
> punpcklwd %xmm0, %xmm0; \
> pshufd $0, %xmm0, %xmm0
>
> +#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> + movd d, %xmm0; \
> + movq r, %rax; \
> + pshufd $0, %xmm0, %xmm0
> +
> #define SECTION(p) p
>
> #ifndef MEMSET_SYMBOL
> @@ -40,10 +45,21 @@
> # define MEMSET_SYMBOL(p,s) memset
> #endif
>
> +#ifndef WMEMSET_SYMBOL
> +# define WMEMSET_CHK_SYMBOL(p,s) p
> +# define WMEMSET_SYMBOL(p,s) __wmemset
> +#endif
> +
> #include "multiarch/memset-vec-unaligned-erms.S"
>
> libc_hidden_builtin_def (memset)
>
> +#if IS_IN (libc)
> +libc_hidden_def (__wmemset)
> +weak_alias (__wmemset, wmemset)
> +libc_hidden_weak (wmemset)
> +#endif
> +
> #if defined SHARED && IS_IN (libc) && !defined USE_MULTIARCH
> strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
> .section .gnu.warning.__memset_zero_constant_len_parameter
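An intrinsics view of the two broadcast macros may make the byte/int
difference concrete: memset splats the low byte of the fill value
across the vector, wmemset splats the low 32-bit word, and everything
after the splat is shared.  Illustration only, assuming SSE2; the
patch itself keeps the hand-written movd/punpck/pshufd sequences:

  #include <emmintrin.h>

  /* memset: broadcast the low byte of C to all 16 byte lanes,
     the effect of movd + punpcklbw + punpcklwd + pshufd $0.  */
  static __m128i
  memset_splat (int c)
  {
    return _mm_set1_epi8 ((char) c);
  }

  /* wmemset: broadcast the low dword to all 4 lanes,
     the effect of movd + pshufd $0.  */
  static __m128i
  wmemset_splat (int c)
  {
    return _mm_set1_epi32 (c);
  }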
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 06d9a9d..a91d2f9 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -300,6 +300,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> __wmemcmp_ssse3)
> IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
>
> + /* Support sysdeps/x86_64/multiarch/wmemset.S. */
> + IFUNC_IMPL (i, name, wmemset,
> + IFUNC_IMPL_ADD (array, i, wmemset, 1,
> + __wmemset_sse2_unaligned)
> + IFUNC_IMPL_ADD (array, i, wmemset,
> + HAS_ARCH_FEATURE (AVX2_Usable),
> + __wmemset_avx2_unaligned)
> + IFUNC_IMPL_ADD (array, i, wmemset,
> + HAS_ARCH_FEATURE (AVX512F_Usable),
> + __wmemset_avx512_unaligned))
> +
> #ifdef SHARED
> /* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */
> IFUNC_IMPL (i, name, __memcpy_chk,
> @@ -417,6 +428,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
> __strncmp_ssse3)
> IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
> +
> + /* Support sysdeps/x86_64/multiarch/wmemset_chk.S. */
> + IFUNC_IMPL (i, name, __wmemset_chk,
> + IFUNC_IMPL_ADD (array, i, __wmemset_chk, 1,
> + __wmemset_chk_sse2_unaligned)
> + IFUNC_IMPL_ADD (array, i, __wmemset_chk,
> + HAS_ARCH_FEATURE (AVX2_Usable),
> + __wmemset_chk_avx2_unaligned)
> + IFUNC_IMPL_ADD (array, i, __wmemset_chk,
> + HAS_ARCH_FEATURE (AVX512F_Usable),
> + __wmemset_chk_avx512_unaligned))
> #endif
>
> return i;
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> index 79975e0..7ab3d89 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> @@ -4,13 +4,19 @@
> # define VMOVU vmovdqu
> # define VMOVA vmovdqa
>
> -# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> +# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> vmovd d, %xmm0; \
> movq r, %rax; \
> vpbroadcastb %xmm0, %ymm0
>
> +# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> + vmovd d, %xmm0; \
> + movq r, %rax; \
> + vpbroadcastd %xmm0, %ymm0
> +
> # define SECTION(p) p##.avx
> # define MEMSET_SYMBOL(p,s) p##_avx2_##s
> +# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
>
> # include "memset-vec-unaligned-erms.S"
> #endif
> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> index a5ec349..0783979 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> @@ -4,14 +4,21 @@
> # define VMOVU vmovdqu64
> # define VMOVA vmovdqa64
>
> -# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> +# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> vmovd d, %xmm0; \
> movq r, %rax; \
> vpbroadcastb %xmm0, %xmm0; \
> vpbroadcastq %xmm0, %zmm0
>
> +# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> + vmovd d, %xmm0; \
> + movq r, %rax; \
> + vpbroadcastd %xmm0, %xmm0; \
> + vpbroadcastq %xmm0, %zmm0
> +
> # define SECTION(p) p##.avx512
> # define MEMSET_SYMBOL(p,s) p##_avx512_##s
> +# define WMEMSET_SYMBOL(p,s) p##_avx512_##s
>
> # include "memset-vec-unaligned-erms.S"
> #endif
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index 704eed9..2eb9e37 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -30,6 +30,10 @@
> # define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
> #endif
>
> +#ifndef WMEMSET_CHK_SYMBOL
> +# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
> +#endif
> +
> #ifndef VZEROUPPER
> # if VEC_SIZE > 16
> # define VZEROUPPER vzeroupper
> @@ -79,6 +83,21 @@ END (__bzero)
> weak_alias (__bzero, bzero)
> #endif
>
> +#if IS_IN (libc)
> +# if defined SHARED
> +ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
> + cmpq %rdx, %rcx
> + jb HIDDEN_JUMPTARGET (__chk_fail)
> +END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
> +# endif
> +
> +ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> + shlq $2, %rdx
> + WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> + jmp L(entry_from_bzero)
> +END (WMEMSET_SYMBOL (__wmemset, unaligned))
> +#endif
> +
> #if defined SHARED && IS_IN (libc)
> ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> cmpq %rdx, %rcx
> @@ -87,8 +106,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> #endif
>
> ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> -L(memset_entry):
> - VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> L(entry_from_bzero):
> cmpq $VEC_SIZE, %rdx
> jb L(less_vec)
> @@ -132,7 +150,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> # endif
>
> ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
> - VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> cmpq $VEC_SIZE, %rdx
> jb L(less_vec)
> cmpq $(VEC_SIZE * 2), %rdx
> diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
> index 9d33118..11f2737 100644
> --- a/sysdeps/x86_64/multiarch/memset.S
> +++ b/sysdeps/x86_64/multiarch/memset.S
> @@ -58,16 +58,23 @@ END(memset)
>
> #if IS_IN (libc)
> # define MEMSET_SYMBOL(p,s) p##_sse2_##s
> +# define WMEMSET_SYMBOL(p,s) p##_sse2_##s
>
> # ifdef SHARED
> -# undef libc_hidden_builtin_def
> +# undef libc_hidden_builtin_def
> /* It doesn't make sense to send libc-internal memset calls through a PLT.
> The speedup we get from using SSE2 instructions is likely eaten away
> by the indirect call in the PLT. */
> -# define libc_hidden_builtin_def(name) \
> - .globl __GI_memset; __GI_memset = __memset_sse2_unaligned
> +# define libc_hidden_builtin_def(name) \
> + .globl __GI_memset; __GI_memset = __memset_sse2_unaligned; \
> + .globl __GI_wmemset; __GI_wmemset = __wmemset_sse2_unaligned; \
> + .globl __GI___wmemset; __GI___wmemset = __wmemset_sse2_unaligned
> # endif
>
> +# undef weak_alias
> +# define weak_alias(original, alias) \
> + .weak bzero; bzero = __bzero
> +
> # undef strong_alias
> # define strong_alias(original, alias)
> #endif
> diff --git a/sysdeps/x86_64/multiarch/wmemset.S b/sysdeps/x86_64/multiarch/wmemset.S
> new file mode 100644
> index 0000000..3bd7ca2
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wmemset.S
> @@ -0,0 +1,47 @@
> +/* Multiple versions of wmemset
> + All versions must be listed in ifunc-impl-list.c.
> + Copyright (C) 2017 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +/* Define multiple versions only for the definition in libc. */
> +#if IS_IN (libc)
> +
> +#include <sysdep.h>
> +#include <shlib-compat.h>
> +#include <init-arch.h>
> +
> +ENTRY(__wmemset)
> + .type __wmemset, @gnu_indirect_function
> + LOAD_RTLD_GLOBAL_RO_RDX
> + lea __wmemset_sse2_unaligned(%rip), %RAX_LP
> + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
> + jnz 1f
> + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
> + jz 1f
> + HAS_ARCH_FEATURE (AVX2_Usable)
> + jz 1f
> + lea __wmemset_avx2_unaligned(%rip), %RAX_LP
> + HAS_ARCH_FEATURE (Prefer_No_AVX512)
> + jnz 1f
> + HAS_ARCH_FEATURE (AVX512F_Usable)
> + jz 1f
> + lea __wmemset_avx512_unaligned(%rip), %RAX_LP
> +1: ret
> +END(__wmemset)
> +
> +weak_alias (__wmemset, wmemset)
> +#endif
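The resolver above defaults to SSE2 and upgrades only when the tuning
bits allow it.  A C paraphrase of the branch chain (reading aid only;
wmemset_ifunc is a hypothetical name, and the feature macros stand in
for the bit tests against the data loaded by LOAD_RTLD_GLOBAL_RO_RDX):

  static void *
  wmemset_ifunc (void)
  {
    if (!HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
        && HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
        && HAS_ARCH_FEATURE (AVX2_Usable))
      {
        if (!HAS_ARCH_FEATURE (Prefer_No_AVX512)
            && HAS_ARCH_FEATURE (AVX512F_Usable))
          return __wmemset_avx512_unaligned;
        return __wmemset_avx2_unaligned;
      }
    return __wmemset_sse2_unaligned;
  }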
> diff --git a/sysdeps/x86_64/multiarch/wmemset_chk.S b/sysdeps/x86_64/multiarch/wmemset_chk.S
> new file mode 100644
> index 0000000..c76fcb1
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wmemset_chk.S
> @@ -0,0 +1,46 @@
> +/* Multiple versions of wmemset_chk
> + All versions must be listed in ifunc-impl-list.c.
> + Copyright (C) 2017 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include <shlib-compat.h>
> +#include <init-arch.h>
> +
> +/* Define multiple versions only for the definition in libc. */
> +#if IS_IN (libc)
> +# ifdef SHARED
> +ENTRY(__wmemset_chk)
> + .type __wmemset_chk, @gnu_indirect_function
> + LOAD_RTLD_GLOBAL_RO_RDX
> + lea __wmemset_chk_sse2_unaligned(%rip), %RAX_LP
> + HAS_ARCH_FEATURE (AVX2_Usable)
> + jz 1f
> + lea __wmemset_chk_avx2_unaligned(%rip), %RAX_LP
> + HAS_ARCH_FEATURE (Prefer_No_AVX512)
> + jnz 1f
> + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
> + jnz 1f
> + HAS_ARCH_FEATURE (AVX512F_Usable)
> + jz 1f
> + lea __wmemset_chk_avx512_unaligned(%rip), %RAX_LP
> +1: ret
> +END(__wmemset_chk)
> +# else
> +# include "../wmemset_chk.S"
> +# endif
> +#endif
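Note that the __wmemset_chk resolver tests the features in a slightly
different order: AVX2 usability gates the first upgrade on its own,
and Prefer_No_VZEROUPPER is only consulted before the AVX512 step.  A
C paraphrase (reading aid only; wmemset_chk_ifunc is a hypothetical
name):

  static void *
  wmemset_chk_ifunc (void)
  {
    if (HAS_ARCH_FEATURE (AVX2_Usable))
      {
        if (!HAS_ARCH_FEATURE (Prefer_No_AVX512)
            && !HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
            && HAS_ARCH_FEATURE (AVX512F_Usable))
          return __wmemset_chk_avx512_unaligned;
        return __wmemset_chk_avx2_unaligned;
      }
    return __wmemset_chk_sse2_unaligned;
  }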
> diff --git a/sysdeps/x86_64/wmemset.S b/sysdeps/x86_64/wmemset.S
> new file mode 100644
> index 0000000..f96d567
> --- /dev/null
> +++ b/sysdeps/x86_64/wmemset.S
> @@ -0,0 +1 @@
> +/* Implemented in memset.S. */
> diff --git a/sysdeps/x86_64/wmemset_chk.S b/sysdeps/x86_64/wmemset_chk.S
> new file mode 100644
> index 0000000..64c2774
> --- /dev/null
> +++ b/sysdeps/x86_64/wmemset_chk.S
> @@ -0,0 +1,33 @@
> +/* Checking wmemset for x86-64.
> + Copyright (C) 2004-2017 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include "asm-syntax.h"
> +
> +#ifndef SHARED
> + /* For libc.so this is defined in wmemset.S.
> + For libc.a, this is a separate source to avoid
> + wmemset bringing in __chk_fail and all routines
> + it calls. */
> + .text
> +ENTRY (__wmemset_chk)
> + cmpq %rdx, %rcx
> + jb __chk_fail
> + jmp wmemset
> +END (__wmemset_chk)
> +#endif
> --
> 2.9.4
>
+1
-----Original Message-----
From: <libc-alpha-owner@sourceware.org> on behalf of "H.J. Lu"
<hjl.tools@gmail.com>
Date: Tuesday, May 30, 2017 at 6:41 PM
To: GNU C Library <libc-alpha@sourceware.org>
Subject: Re: [PATCH] x86-64: Add wmemset optimized with SSE2/AVX2/AVX512
>On Sun, May 21, 2017 at 1:34 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> The difference between memset and wmemset is the fill element: byte
>> vs int.  Add stubs to the SSE2/AVX2/AVX512 memset implementations
>> for wmemset that broadcast an int instead of a byte and scale the
>> count to bytes:
>>
>> [...]
>>
>> OK for master?
>
>Any objections?
>
>--
>H.J.