From patchwork Tue Jul 1 15:56:52 2014 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "H.J. Lu" X-Patchwork-Id: 1839 Received: (qmail 30091 invoked by alias); 1 Jul 2014 15:57:09 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 30006 invoked by uid 89); 1 Jul 2014 15:57:01 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=2.8 required=5.0 tests=AWL, BAYES_00, FREEMAIL_FROM, RCVD_IN_DNSWL_LOW, SPAM_URI1, SPF_PASS autolearn=no version=3.3.2 X-HELO: mail-ob0-f181.google.com MIME-Version: 1.0 X-Received: by 10.60.179.138 with SMTP id dg10mr50584289oec.13.1404230214052; Tue, 01 Jul 2014 08:56:54 -0700 (PDT) Date: Tue, 1 Jul 2014 08:56:52 -0700 Message-ID: Subject: [PATCH] Enable AVX2 optimized memset only if -mavx2 works From: "H.J. Lu" To: myllynen@redhat.com Cc: Ling Ma , GNU C Library , Ondrej Bilka , Liubov Dmitrieva , Ling Ma On Tue, Jul 1, 2014 at 2:03 AM, Marko Myllynen wrote: > Hi, > > On 2014-04-04 10:34, ling.ma.program@gmail.com wrote: >> From: Ling Ma >> >> In this patch we manage to reduce miss branch prediction by >> avoid using branch instructions and force destination to be aligned >> with avx instruction. >> >> --- >> In this version we removed prefetch and append vmovd. 
>> >> ChangeLog | 9 ++ >> sysdeps/x86_64/multiarch/Makefile | 4 +- >> sysdeps/x86_64/multiarch/memset-avx2.S | 192 +++++++++++++++++++++++++++++++++ >> sysdeps/x86_64/multiarch/memset.S | 59 ++++++++++ >> sysdeps/x86_64/multiarch/memset_chk.S | 44 ++++++++ >> 5 files changed, 307 insertions(+), 1 deletion(-) >> create mode 100644 sysdeps/x86_64/multiarch/memset-avx2.S >> create mode 100644 sysdeps/x86_64/multiarch/memset.S >> create mode 100644 sysdeps/x86_64/multiarch/memset_chk.S >> >> diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S >> new file mode 100644 >> index 0000000..08e8ee8 >> --- /dev/null >> +++ b/sysdeps/x86_64/multiarch/memset-avx2.S >> @@ -0,0 +1,192 @@ >> +/* memset with AVX2 >> + Copyright (C) 2014 Free Software Foundation, Inc. >> + Contributed by Alibaba Group. >> + This file is part of the GNU C Library. >> + >> + The GNU C Library is free software; you can redistribute it and/or >> + modify it under the terms of the GNU Lesser General Public >> + License as published by the Free Software Foundation; either >> + version 2.1 of the License, or (at your option) any later version. >> + >> + The GNU C Library is distributed in the hope that it will be useful, >> + but WITHOUT ANY WARRANTY; without even the implied warranty of >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + Lesser General Public License for more details. >> + >> + You should have received a copy of the GNU Lesser General Public >> + License along with the GNU C Library; if not, see >> + . 
*/ >> + >> +#include >> + >> +#if !defined NOT_IN_libc >> + >> +#include "asm-syntax.h" >> +#ifndef ALIGN >> +# define ALIGN(n) .p2align n >> +#endif >> +#ifndef MEMSET >> +# define MEMSET __memset_avx2 >> +# define MEMSET_CHK __memset_chk_avx2 >> +#endif >> + >> + .section .text.avx2,"ax",@progbits >> +#if defined PIC >> +ENTRY (MEMSET_CHK) >> + cmpq %rdx, %rcx >> + jb HIDDEN_JUMPTARGET (__chk_fail) >> +END (MEMSET_CHK) >> +#endif >> + >> +ENTRY (MEMSET) >> + vpxor %xmm0, %xmm0, %xmm0 >> + vmovd %esi, %xmm1 >> + lea (%rdi, %rdx), %r8 >> + vpshufb %xmm0, %xmm1, %xmm0 >> + mov %rdi, %rax >> + cmp $256, %rdx >> + jae L(256bytesormore) >> + vmovd %xmm0, %rcx >> + cmp $128, %rdx >> + jb L(less_128bytes) >> + vmovups %xmm0, (%rdi) >> + vmovups %xmm0, 0x10(%rdi) >> + vmovups %xmm0, 0x20(%rdi) >> + vmovups %xmm0, 0x30(%rdi) >> + vmovups %xmm0, 0x40(%rdi) >> + vmovups %xmm0, 0x50(%rdi) >> + vmovups %xmm0, 0x60(%rdi) >> + vmovups %xmm0, 0x70(%rdi) >> + vmovups %xmm0, -0x80(%r8) >> + vmovups %xmm0, -0x70(%r8) >> + vmovups %xmm0, -0x60(%r8) >> + vmovups %xmm0, -0x50(%r8) >> + vmovups %xmm0, -0x40(%r8) >> + vmovups %xmm0, -0x30(%r8) >> + vmovups %xmm0, -0x20(%r8) >> + vmovups %xmm0, -0x10(%r8) >> + ret >> + ALIGN(4) >> +L(less_128bytes): >> + cmp $64, %edx >> + jb L(less_64bytes) >> + vmovups %xmm0, (%rdi) >> + vmovups %xmm0, 0x10(%rdi) >> + vmovups %xmm0, 0x20(%rdi) >> + vmovups %xmm0, 0x30(%rdi) >> + vmovups %xmm0, -0x40(%r8) >> + vmovups %xmm0, -0x30(%r8) >> + vmovups %xmm0, -0x20(%r8) >> + vmovups %xmm0, -0x10(%r8) >> + ret >> + ALIGN(4) >> +L(less_64bytes): >> + cmp $32, %edx >> + jb L(less_32bytes) >> + vmovups %xmm0, (%rdi) >> + vmovups %xmm0, 0x10(%rdi) >> + vmovups %xmm0, -0x20(%r8) >> + vmovups %xmm0, -0x10(%r8) >> + ret >> + ALIGN(4) >> +L(less_32bytes): >> + cmp $16, %edx >> + jb L(less_16bytes) >> + vmovups %xmm0, (%rdi) >> + vmovups %xmm0, -0x10(%r8) >> + ret >> + ALIGN(4) >> +L(less_16bytes): >> + cmp $8, %edx >> + jb L(less_8bytes) >> + mov %rcx, (%rdi) >> + 
mov %rcx, -0x08(%r8) >> + ret >> + ALIGN(4) >> +L(less_8bytes): >> + cmp $4, %edx >> + jb L(less_4bytes) >> + mov %ecx, (%rdi) >> + mov %ecx, -0x04(%r8) >> + ALIGN(4) >> +L(less_4bytes): >> + cmp $2, %edx >> + jb L(less_2bytes) >> + mov %cx, (%rdi) >> + mov %cx, -0x02(%r8) >> + ret >> + ALIGN(4) >> +L(less_2bytes): >> + cmp $1, %edx >> + jb L(less_1bytes) >> + mov %cl, (%rdi) >> +L(less_1bytes): >> + ret >> + >> + ALIGN(4) >> +L(256bytesormore): >> + vinserti128 $1, %xmm0, %ymm0, %ymm0 > > this breaks build on RHEL 6 x86_64: > > ../sysdeps/x86_64/multiarch/memset-avx2.S: > ../sysdeps/x86_64/multiarch/memset-avx2.S: Assembler messages: > Assembler messages: > ../sysdeps/x86_64/multiarch/memset-avx2.S:132: > ../sysdeps/x86_64/multiarch/memset-avx2.S:132: Error: Error: no such > instruction: `vinserti128 $1,%xmm0,%ymm0,%ymm0'no such instruction: > `vinserti128 $1,%xmm0,%ymm0,%ymm0' > > Cheers, > This patch enables AVX2 optimized memset only if -mavx2 works. Tested with GCC 4.6 and 4.8 on Fedora 20/x86-64. OK to install? Thanks. H.J. --- 2014-07-01 H.J. Lu * config.h.in (HAVE_AVX2_SUPPORT): New #undef. * sysdeps/i386/configure.ac: Set HAVE_AVX2_SUPPORT and config-cflags-avx2. * sysdeps/x86_64/configure.ac: Likewise. * sysdeps/i386/configure: Regenerated. * sysdeps/x86_64/configure: Likewise. * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add memset-avx2 only if config-cflags-avx2 is yes. * sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Tests for memset_chk and memset only if HAVE_AVX2_SUPPORT is defined. * sysdeps/x86_64/multiarch/memset.S: Define multiple versions only if HAVE_AVX2_SUPPORT is defined. * sysdeps/x86_64/multiarch/memset_chk.S: Likewise. From 99d75c147abe8ba91f3ad7123126bda4e3f31045 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Tue, 1 Jul 2014 08:52:47 -0700 Subject: [PATCH] Enable AVX2 optimized memset only if -mavx2 works * config.h.in (HAVE_AVX2_SUPPORT): New #undef. 
* sysdeps/i386/configure.ac: Set HAVE_AVX2_SUPPORT and config-cflags-avx2. * sysdeps/x86_64/configure.ac: Likewise. * sysdeps/i386/configure: Regenerated. * sysdeps/x86_64/configure: Likewise. * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add memset-avx2 only if config-cflags-avx2 is yes. * sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Tests for memset_chk and memset only if HAVE_AVX2_SUPPORT is defined. * sysdeps/x86_64/multiarch/memset.S: Define multiple versions only if HAVE_AVX2_SUPPORT is defined. * sysdeps/x86_64/multiarch/memset_chk.S: Likewise. --- ChangeLog | 17 +++++++++++++++++ config.h.in | 3 +++ sysdeps/i386/configure | 26 ++++++++++++++++++++++++++ sysdeps/i386/configure.ac | 9 +++++++++ sysdeps/x86_64/configure | 26 ++++++++++++++++++++++++++ sysdeps/x86_64/configure.ac | 9 +++++++++ sysdeps/x86_64/multiarch/Makefile | 7 +++++-- sysdeps/x86_64/multiarch/ifunc-impl-list.c | 2 ++ sysdeps/x86_64/multiarch/memset.S | 24 +++++++++++++----------- sysdeps/x86_64/multiarch/memset_chk.S | 2 +- 10 files changed, 111 insertions(+), 14 deletions(-) diff --git a/ChangeLog b/ChangeLog index a1e44b1..20abae7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +2014-07-01 H.J. Lu + + * config.h.in (HAVE_AVX2_SUPPORT): New #undef. + * sysdeps/i386/configure.ac: Set HAVE_AVX2_SUPPORT and + config-cflags-avx2. + * sysdeps/x86_64/configure.ac: Likewise. + * sysdeps/i386/configure: Regenerated. + * sysdeps/x86_64/configure: Likewise. + * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add + memset-avx2 only if config-cflags-avx2 is yes. + * sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): + Tests for memset_chk and memset only if HAVE_AVX2_SUPPORT is + defined. + * sysdeps/x86_64/multiarch/memset.S: Define multiple versions + only if HAVE_AVX2_SUPPORT is defined. + * sysdeps/x86_64/multiarch/memset_chk.S: Likewise. + 2014-07-01 Stefan Liebler * sysdeps/s390/fpu/libm-test-ulps: Regenerate. 
diff --git a/config.h.in b/config.h.in index 2dcd135..97b5571 100644 --- a/config.h.in +++ b/config.h.in @@ -103,6 +103,9 @@ /* Define if gcc supports FMA4. */ #undef HAVE_FMA4_SUPPORT +/* Define if gcc supports AVX2. */ +#undef HAVE_AVX2_SUPPORT + /* Define if the compiler\'s exception support is based on libunwind. */ #undef HAVE_CC_WITH_LIBUNWIND diff --git a/sysdeps/i386/configure b/sysdeps/i386/configure index f0a20e3..6e89b59 100644 --- a/sysdeps/i386/configure +++ b/sysdeps/i386/configure @@ -240,6 +240,32 @@ $as_echo "$libc_cv_cc_novzeroupper" >&6; } config_vars="$config_vars config-cflags-novzeroupper = $libc_cv_cc_novzeroupper" +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 support" >&5 +$as_echo_n "checking for AVX2 support... " >&6; } +if ${libc_cv_cc_avx2+:} false; then : + $as_echo_n "(cached) " >&6 +else + if { ac_try='${CC-cc} -mavx2 -xc /dev/null -S -o /dev/null' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then : + libc_cv_cc_avx2=yes +else + libc_cv_cc_avx2=no +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_cc_avx2" >&5 +$as_echo "$libc_cv_cc_avx2" >&6; } +if test $libc_cv_cc_avx2 = yes; then + $as_echo "#define HAVE_AVX2_SUPPORT 1" >>confdefs.h + +fi +config_vars="$config_vars +config-cflags-avx2 = $libc_cv_cc_avx2" + $as_echo "#define USE_REGPARMS 1" >>confdefs.h diff --git a/sysdeps/i386/configure.ac b/sysdeps/i386/configure.ac index dfe0b47..35c4522 100644 --- a/sysdeps/i386/configure.ac +++ b/sysdeps/i386/configure.ac @@ -88,6 +88,15 @@ LIBC_TRY_CC_OPTION([-mno-vzeroupper], ]) LIBC_CONFIG_VAR([config-cflags-novzeroupper], [$libc_cv_cc_novzeroupper]) +dnl Check if -mavx2 works. 
+AC_CACHE_CHECK(for AVX2 support, libc_cv_cc_avx2, [dnl +LIBC_TRY_CC_OPTION([-mavx2], [libc_cv_cc_avx2=yes], [libc_cv_cc_avx2=no]) +]) +if test $libc_cv_cc_avx2 = yes; then + AC_DEFINE(HAVE_AVX2_SUPPORT) +fi +LIBC_CONFIG_VAR([config-cflags-avx2], [$libc_cv_cc_avx2]) + AC_DEFINE(USE_REGPARMS) dnl It is always possible to access static and hidden symbols in an diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure index 45d868d..7d4dadd 100644 --- a/sysdeps/x86_64/configure +++ b/sysdeps/x86_64/configure @@ -249,6 +249,32 @@ if test $libc_cv_asm_mpx == yes; then fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 support" >&5 +$as_echo_n "checking for AVX2 support... " >&6; } +if ${libc_cv_cc_avx2+:} false; then : + $as_echo_n "(cached) " >&6 +else + if { ac_try='${CC-cc} -mavx2 -xc /dev/null -S -o /dev/null' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then : + libc_cv_cc_avx2=yes +else + libc_cv_cc_avx2=no +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_cc_avx2" >&5 +$as_echo "$libc_cv_cc_avx2" >&6; } +if test $libc_cv_cc_avx2 = yes; then + $as_echo "#define HAVE_AVX2_SUPPORT 1" >>confdefs.h + +fi +config_vars="$config_vars +config-cflags-avx2 = $libc_cv_cc_avx2" + $as_echo "#define PI_STATIC_AND_HIDDEN 1" >>confdefs.h # work around problem with autoconf and empty lines at the end of files diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac index 9138f63..c9f9a51 100644 --- a/sysdeps/x86_64/configure.ac +++ b/sysdeps/x86_64/configure.ac @@ -90,6 +90,15 @@ if test $libc_cv_asm_mpx == yes; then AC_DEFINE(HAVE_MPX_SUPPORT) fi +dnl Check if -mavx2 works. 
+AC_CACHE_CHECK(for AVX2 support, libc_cv_cc_avx2, [dnl +LIBC_TRY_CC_OPTION([-mavx2], [libc_cv_cc_avx2=yes], [libc_cv_cc_avx2=no]) +]) +if test $libc_cv_cc_avx2 = yes; then + AC_DEFINE(HAVE_AVX2_SUPPORT) +fi +LIBC_CONFIG_VAR([config-cflags-avx2], [$libc_cv_cc_avx2]) + dnl It is always possible to access static and hidden symbols in an dnl position independent way. AC_DEFINE(PI_STATIC_AND_HIDDEN) diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 42df96f..3bb9702 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -17,8 +17,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ strcpy-sse2-unaligned strncpy-sse2-unaligned \ stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ strcat-sse2-unaligned strncat-sse2-unaligned \ - strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ - memset-avx2 + strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c varshift @@ -27,6 +26,10 @@ CFLAGS-strcspn-c.c += -msse4 CFLAGS-strpbrk-c.c += -msse4 CFLAGS-strspn-c.c += -msse4 endif + +ifeq (yes,$(config-cflags-avx2)) +sysdep_routines += memset-avx2 +endif endif ifeq ($(subdir),wcsmbs) diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index f1593c5..7e93e59 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -61,6 +61,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memmove_ssse3) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2)) +#ifdef HAVE_AVX2_SUPPORT /* Support sysdeps/x86_64/multiarch/memset_chk.S. 
*/ IFUNC_IMPL (i, name, __memset_chk, IFUNC_IMPL_ADD (array, i, __memset_chk, 1, __memset_chk_sse2) @@ -71,6 +72,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, memset, IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2) IFUNC_IMPL_ADD (array, i, memset, HAS_AVX2, __memset_avx2)) +#endif /* Support sysdeps/x86_64/multiarch/stpncpy.S. */ IFUNC_IMPL (i, name, stpncpy, diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S index 3113d1c..00d46d1 100644 --- a/sysdeps/x86_64/multiarch/memset.S +++ b/sysdeps/x86_64/multiarch/memset.S @@ -17,12 +17,13 @@ License along with the GNU C Library; if not, see . */ +#ifdef HAVE_AVX2_SUPPORT #include #include #include /* Define multiple versions only for the definition in lib. */ -#ifndef NOT_IN_libc +# ifndef NOT_IN_libc ENTRY(memset) .type memset, @gnu_indirect_function cmpl $0, __cpu_features+KIND_OFFSET(%rip) @@ -34,26 +35,27 @@ ENTRY(memset) leaq __memset_avx2(%rip), %rax 2: ret END(memset) -#endif +# endif -#if !defined NOT_IN_libc -# undef memset -# define memset __memset_sse2 +# if !defined NOT_IN_libc +# undef memset +# define memset __memset_sse2 -# undef __memset_chk -# define __memset_chk __memset_chk_sse2 +# undef __memset_chk +# define __memset_chk __memset_chk_sse2 -# ifdef SHARED +# ifdef SHARED # undef libc_hidden_builtin_def /* It doesn't make sense to send libc-internal memset calls through a PLT. The speedup we get from using GPR instruction is likely eaten away by the indirect call in the PLT. 
*/ # define libc_hidden_builtin_def(name) \ .globl __GI_memset; __GI_memset = __memset_sse2 -# endif +# endif -# undef strong_alias -# define strong_alias(original, alias) +# undef strong_alias +# define strong_alias(original, alias) +# endif #endif #include "../memset.S" diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S index 2182780..8a607bd 100644 --- a/sysdeps/x86_64/multiarch/memset_chk.S +++ b/sysdeps/x86_64/multiarch/memset_chk.S @@ -22,7 +22,7 @@ /* Define multiple versions only for the definition in lib. */ #ifndef NOT_IN_libc -# ifdef SHARED +# if defined SHARED && defined HAVE_AVX2_SUPPORT ENTRY(__memset_chk) .type __memset_chk, @gnu_indirect_function cmpl $0, __cpu_features+KIND_OFFSET(%rip) -- 1.9.3