x86-64: Add wmemset optimized with SSE2/AVX2/AVX512

Message ID: 20170521203442.GA20131@gmail.com
State: New, archived

Commit Message

H.J. Lu May 21, 2017, 8:34 p.m. UTC
The only difference between memset and wmemset is the element type,
byte vs. int.  Add stubs to the SSE2/AVX2/AVX512 memset implementations
for wmemset, with the broadcast constant and the size computation
updated accordingly:

SSE2 wmemset:
	shl    $0x2,%rdx		# scale count from wchar_t elements to bytes
	movd   %esi,%xmm0		# load the 32-bit fill value
	mov    %rdi,%rax		# return value is the destination pointer
	pshufd $0x0,%xmm0,%xmm0		# broadcast the dword to all four lanes
	jmp	entry_from_wmemset

SSE2 memset:
	movd   %esi,%xmm0		# load the fill byte
	mov    %rdi,%rax		# return value is the destination pointer
	punpcklbw %xmm0,%xmm0		# duplicate byte into word
	punpcklwd %xmm0,%xmm0		# duplicate word into dword
	pshufd $0x0,%xmm0,%xmm0		# broadcast the dword to all four lanes
entry_from_wmemset:

Since the ERMS versions of wmemset require "rep stosl" instead of
"rep stosb", only the vector store stubs of the SSE2/AVX2/AVX512
wmemset are added.  The SSE2 wmemset is about 3X faster and the AVX2
wmemset is about 6X faster on Haswell.
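
For reference, here is a minimal C model of the stub above (not glibc
code; the name wmemset_model is made up for illustration).  On x86-64,
wchar_t is a 32-bit int, so wmemset only has to scale the element
count to bytes (the "shl $0x2,%rdx" above) and broadcast the fill
value directly, skipping the byte-duplication steps memset needs:

#include <stddef.h>
#include <wchar.h>

/* Illustrative model only: wmemset fills N wchar_t (32-bit) elements,
   so the stub multiplies the count by 4 and reuses the byte-oriented
   vector-store code that memset already provides.  */
wchar_t *
wmemset_model (wchar_t *s, wchar_t c, size_t n)
{
  size_t nbytes = n * sizeof (wchar_t);	/* shl $0x2,%rdx  */
  unsigned char *p = (unsigned char *) s;

  /* Scalar stand-in for the shared vector stores, whose lanes all
     hold C after the pshufd broadcast.  */
  for (size_t i = 0; i < nbytes; i += sizeof (wchar_t))
    *(wchar_t *) (p + i) = c;
  return s;
}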

OK for master?

H.J.
---
	* include/wchar.h (__wmemset_chk): New.
	* sysdeps/x86_64/memset.S (VDUP_TO_VEC0_AND_SET_RETURN): Renamed
	to MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
	(WMEMSET_CHK_SYMBOL): Likewise.
	(WMEMSET_SYMBOL): Likewise.
	(__wmemset): Add hidden definition.
	(wmemset): Add weak hidden definition.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Add __wmemset_sse2_unaligned,
	__wmemset_avx2_unaligned, __wmemset_avx512_unaligned,
	__wmemset_chk_sse2_unaligned, __wmemset_chk_avx2_unaligned
	and __wmemset_chk_avx512_unaligned.
	* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
	(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
	(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
	(WMEMSET_SYMBOL): Likewise.
	* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
	(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
	(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
	(WMEMSET_SYMBOL): Likewise.
	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Updated.
	(WMEMSET_CHK_SYMBOL): New.
	(WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)): Likewise.
	(WMEMSET_SYMBOL (__wmemset, unaligned)): Likewise.
	* sysdeps/x86_64/multiarch/memset.S (WMEMSET_SYMBOL): New.
	(libc_hidden_builtin_def): Also define __GI_wmemset and
	__GI___wmemset.
	(weak_alias): New.
	* sysdeps/x86_64/multiarch/wmemset.S: New file.
	* sysdeps/x86_64/multiarch/wmemset_chk.S: Likewise.
	* sysdeps/x86_64/wmemset.S: Likewise.
	* sysdeps/x86_64/wmemset_chk.S: Likewise.
---
 include/wchar.h                                    |  3 ++
 sysdeps/x86_64/memset.S                            | 18 ++++++++-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c         | 22 ++++++++++
 .../x86_64/multiarch/memset-avx2-unaligned-erms.S  |  8 +++-
 .../multiarch/memset-avx512-unaligned-erms.S       |  9 ++++-
 .../x86_64/multiarch/memset-vec-unaligned-erms.S   | 24 +++++++++--
 sysdeps/x86_64/multiarch/memset.S                  | 13 ++++--
 sysdeps/x86_64/multiarch/wmemset.S                 | 47 ++++++++++++++++++++++
 sysdeps/x86_64/multiarch/wmemset_chk.S             | 46 +++++++++++++++++++++
 sysdeps/x86_64/wmemset.S                           |  1 +
 sysdeps/x86_64/wmemset_chk.S                       | 33 +++++++++++++++
 11 files changed, 215 insertions(+), 9 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/wmemset.S
 create mode 100644 sysdeps/x86_64/multiarch/wmemset_chk.S
 create mode 100644 sysdeps/x86_64/wmemset.S
 create mode 100644 sysdeps/x86_64/wmemset_chk.S
  

Comments

H.J. Lu May 30, 2017, 11:41 p.m. UTC | #1
On Sun, May 21, 2017 at 1:34 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> OK for master?

Any objections?

Victor Rodriguez May 31, 2017, 11:29 a.m. UTC | #2
+1

Patch

diff --git a/include/wchar.h b/include/wchar.h
index e2579a1..a773d56 100644
--- a/include/wchar.h
+++ b/include/wchar.h
@@ -157,6 +157,9 @@  extern wchar_t *__wmemmove (wchar_t *__s1, const wchar_t *__s2,
 extern wchar_t *__wcschrnul (const wchar_t *__s, wchar_t __wc)
      __attribute_pure__;
 
+extern wchar_t *__wmemset_chk (wchar_t *__s, wchar_t __c, size_t __n,
+			       size_t __ns) __THROW;
+
 extern int __vfwscanf (__FILE *__restrict __s,
 		       const wchar_t *__restrict __format,
 		       __gnuc_va_list __arg)
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 69ed509..4127878 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -26,13 +26,18 @@ 
 #define VMOVU		movdqu
 #define VMOVA		movdqa
 
-#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
   movq r, %rax; \
   punpcklbw %xmm0, %xmm0; \
   punpcklwd %xmm0, %xmm0; \
   pshufd $0, %xmm0, %xmm0
 
+#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  movd d, %xmm0; \
+  movq r, %rax; \
+  pshufd $0, %xmm0, %xmm0
+
 #define SECTION(p)		p
 
 #ifndef MEMSET_SYMBOL
@@ -40,10 +45,21 @@ 
 # define MEMSET_SYMBOL(p,s)	memset
 #endif
 
+#ifndef WMEMSET_SYMBOL
+# define WMEMSET_CHK_SYMBOL(p,s) p
+# define WMEMSET_SYMBOL(p,s)	__wmemset
+#endif
+
 #include "multiarch/memset-vec-unaligned-erms.S"
 
 libc_hidden_builtin_def (memset)
 
+#if IS_IN (libc)
+libc_hidden_def (__wmemset)
+weak_alias (__wmemset, wmemset)
+libc_hidden_weak (wmemset)
+#endif
+
 #if defined SHARED && IS_IN (libc) && !defined USE_MULTIARCH
 strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
 	.section .gnu.warning.__memset_zero_constant_len_parameter
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 06d9a9d..a91d2f9 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -300,6 +300,17 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __wmemcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
 
+  /* Support sysdeps/x86_64/multiarch/wmemset.S.  */
+  IFUNC_IMPL (i, name, wmemset,
+	      IFUNC_IMPL_ADD (array, i, wmemset, 1,
+			      __wmemset_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, wmemset,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __wmemset_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, wmemset,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __wmemset_avx512_unaligned))
+
 #ifdef SHARED
   /* Support sysdeps/x86_64/multiarch/memcpy_chk.S.  */
   IFUNC_IMPL (i, name, __memcpy_chk,
@@ -417,6 +428,17 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
 			      __strncmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/wmemset_chk.S.  */
+  IFUNC_IMPL (i, name, __wmemset_chk,
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk, 1,
+			      __wmemset_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __wmemset_chk_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __wmemset_chk_avx512_unaligned))
 #endif
 
   return i;
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 79975e0..7ab3d89 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -4,13 +4,19 @@ 
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
 
-# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
   movq r, %rax; \
   vpbroadcastb %xmm0, %ymm0
 
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  vmovd d, %xmm0; \
+  movq r, %rax; \
+  vpbroadcastd %xmm0, %ymm0
+
 # define SECTION(p)		p##.avx
 # define MEMSET_SYMBOL(p,s)	p##_avx2_##s
+# define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
 
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index a5ec349..0783979 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -4,14 +4,21 @@ 
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 
-# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
   movq r, %rax; \
   vpbroadcastb %xmm0, %xmm0; \
   vpbroadcastq %xmm0, %zmm0
 
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  vmovd d, %xmm0; \
+  movq r, %rax; \
+  vpbroadcastd %xmm0, %xmm0; \
+  vpbroadcastq %xmm0, %zmm0
+
 # define SECTION(p)		p##.avx512
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
+# define WMEMSET_SYMBOL(p,s)	p##_avx512_##s
 
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 704eed9..2eb9e37 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -30,6 +30,10 @@ 
 # define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
 #endif
 
+#ifndef WMEMSET_CHK_SYMBOL
+# define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
+#endif
+
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER			vzeroupper
@@ -79,6 +83,21 @@  END (__bzero)
 weak_alias (__bzero, bzero)
 #endif
 
+#if IS_IN (libc)
+# if defined SHARED
+ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+# endif
+
+ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
+	shlq	$2, %rdx
+	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	jmp	L(entry_from_bzero)
+END (WMEMSET_SYMBOL (__wmemset, unaligned))
+#endif
+
 #if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 	cmpq	%rdx, %rcx
@@ -87,8 +106,7 @@  END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 #endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
-L(memset_entry):
-	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
 L(entry_from_bzero):
 	cmpq	$VEC_SIZE, %rdx
 	jb	L(less_vec)
@@ -132,7 +150,7 @@  END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 # endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
-	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
 	cmpq	$VEC_SIZE, %rdx
 	jb	L(less_vec)
 	cmpq	$(VEC_SIZE * 2), %rdx
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index 9d33118..11f2737 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -58,16 +58,23 @@  END(memset)
 
 #if IS_IN (libc)
 # define MEMSET_SYMBOL(p,s)	p##_sse2_##s
+# define WMEMSET_SYMBOL(p,s)	p##_sse2_##s
 
 # ifdef SHARED
-# undef libc_hidden_builtin_def
+#  undef libc_hidden_builtin_def
 /* It doesn't make sense to send libc-internal memset calls through a PLT.
    The speedup we get from using SSE2 instructions is likely eaten away
    by the indirect call in the PLT.  */
-# define libc_hidden_builtin_def(name) \
-	.globl __GI_memset; __GI_memset = __memset_sse2_unaligned
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memset; __GI_memset = __memset_sse2_unaligned; \
+	.globl __GI_wmemset; __GI_wmemset = __wmemset_sse2_unaligned; \
+	.globl __GI___wmemset; __GI___wmemset = __wmemset_sse2_unaligned
 # endif
 
+# undef weak_alias
+# define weak_alias(original, alias) \
+	.weak bzero; bzero = __bzero
+
 # undef strong_alias
 # define strong_alias(original, alias)
 #endif
diff --git a/sysdeps/x86_64/multiarch/wmemset.S b/sysdeps/x86_64/multiarch/wmemset.S
new file mode 100644
index 0000000..3bd7ca2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemset.S
@@ -0,0 +1,47 @@ 
+/* Multiple versions of wmemset
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include <shlib-compat.h>
+#include <init-arch.h>
+
+ENTRY(__wmemset)
+	.type	__wmemset, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	lea	__wmemset_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	lea	__wmemset_avx2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_AVX512)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	1f
+	lea	__wmemset_avx512_unaligned(%rip), %RAX_LP
+1:	ret
+END(__wmemset)
+
+weak_alias (__wmemset, wmemset)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wmemset_chk.S b/sysdeps/x86_64/multiarch/wmemset_chk.S
new file mode 100644
index 0000000..c76fcb1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemset_chk.S
@@ -0,0 +1,46 @@ 
+/* Multiple versions of wmemset_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <shlib-compat.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# ifdef SHARED
+ENTRY(__wmemset_chk)
+	.type	__wmemset_chk, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	lea	__wmemset_chk_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	lea	__wmemset_chk_avx2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_AVX512)
+	jnz	1f
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	1f
+	lea	__wmemset_chk_avx512_unaligned(%rip), %RAX_LP
+1:	ret
+END(__wmemset_chk)
+# else
+#  include "../wmemset_chk.S"
+# endif
+#endif
diff --git a/sysdeps/x86_64/wmemset.S b/sysdeps/x86_64/wmemset.S
new file mode 100644
index 0000000..f96d567
--- /dev/null
+++ b/sysdeps/x86_64/wmemset.S
@@ -0,0 +1 @@ 
+/* Implemented in memset.S.  */
diff --git a/sysdeps/x86_64/wmemset_chk.S b/sysdeps/x86_64/wmemset_chk.S
new file mode 100644
index 0000000..64c2774
--- /dev/null
+++ b/sysdeps/x86_64/wmemset_chk.S
@@ -0,0 +1,33 @@ 
+/* Checking wmemset for x86-64.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef SHARED
+	/* For libc.so this is defined in wmemset.S.
+	   For libc.a, this is a separate source to avoid
+	   wmemset bringing in __chk_fail and all routines
+	   it calls.  */
+        .text
+ENTRY (__wmemset_chk)
+	cmpq	%rdx, %rcx
+	jb	__chk_fail
+	jmp	wmemset
+END (__wmemset_chk)
+#endif
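
As an aside, the selection logic in sysdeps/x86_64/multiarch/wmemset.S
above can be restated in C.  This is only a hypothetical sketch for
readability: select_wmemset and the feature flags are stand-ins for
the real ifunc resolver and its HAS_ARCH_FEATURE checks, which read
the cpu-features data set up by LOAD_RTLD_GLOBAL_RO_RDX.

#include <stddef.h>
#include <wchar.h>

typedef wchar_t *(*wmemset_fn) (wchar_t *, wchar_t, size_t);

extern wchar_t *__wmemset_sse2_unaligned (wchar_t *, wchar_t, size_t);
extern wchar_t *__wmemset_avx2_unaligned (wchar_t *, wchar_t, size_t);
extern wchar_t *__wmemset_avx512_unaligned (wchar_t *, wchar_t, size_t);

/* Hypothetical stand-ins for the HAS_ARCH_FEATURE bits.  */
static int prefer_no_vzeroupper, avx_fast_unaligned_load, avx2_usable;
static int prefer_no_avx512, avx512f_usable;

static wmemset_fn
select_wmemset (void)
{
  /* Stay on SSE2 unless AVX2 is usable, unaligned AVX loads are fast
     and VZEROUPPER is not to be avoided.  */
  if (prefer_no_vzeroupper || !avx_fast_unaligned_load || !avx2_usable)
    return __wmemset_sse2_unaligned;
  /* Upgrade to AVX-512 only if AVX512F is usable and not dispreferred.  */
  if (prefer_no_avx512 || !avx512f_usable)
    return __wmemset_avx2_unaligned;
  return __wmemset_avx512_unaligned;
}

Note that the __wmemset_chk resolver in the patch gates its AVX2
choice on AVX2_Usable alone and folds the VZEROUPPER check into the
AVX-512 decision, a slightly different ordering than __wmemset.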