From patchwork Fri Mar 18 20:25:35 2016
X-Patchwork-Submitter: "Lu, Hongjiu"
X-Patchwork-Id: 11414
Date: Fri, 18 Mar 2016 13:25:35 -0700
From: "H.J. Lu"
To: GNU C Library
Subject: [PATCH] Set index_arch_AVX_Fast_Unaligned_Load only for Intel
 processors
Message-ID: <20160318202535.GA13420@intel.com>

Since only Intel processors with AVX2 have fast unaligned loads, we
should set index_arch_AVX_Fast_Unaligned_Load only for Intel
processors.  Add PTR_HAS_CPU_FEATURE and PTR_HAS_ARCH_FEATURE to
avoid loading GLRO(dl_x86_cpu_features) in cpu-features.c.

Tested on x86-64.  OK for master?

H.J.
---
	* sysdeps/x86/cpu-features.c (get_common_indeces): Remove
	inline.  Set AVX, AVX2, AVX512, FMA and FMA4 usable bits here.
	(init_cpu_features): Replace HAS_CPU_FEATURE and
	HAS_ARCH_FEATURE with PTR_HAS_CPU_FEATURE and
	PTR_HAS_ARCH_FEATURE.  Set index_arch_AVX_Fast_Unaligned_Load
	for Intel processors with usable AVX2.
	* sysdeps/x86/cpu-features.h (PTR_HAS_CPU_FEATURE): New macro.
	(PTR_HAS_ARCH_FEATURE): Likewise.
	(HAS_CPU_FEATURE): Use PTR_HAS_CPU_FEATURE.
	(HAS_ARCH_FEATURE): Use PTR_HAS_ARCH_FEATURE.
---
 sysdeps/x86/cpu-features.c | 124 +++++++++++++++++++++++----------------------
 sysdeps/x86/cpu-features.h |  10 +++-
 2 files changed, 71 insertions(+), 63 deletions(-)

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 1787716..96949ae 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -19,7 +19,7 @@
 #include <cpuid.h>
 #include <cpu-features.h>
 
-static inline void
+static void
 get_common_indeces (struct cpu_features *cpu_features,
 		    unsigned int *family, unsigned int *model,
 		    unsigned int *extended_model)
@@ -28,7 +28,7 @@ get_common_indeces (struct cpu_features *cpu_features,
   __cpuid (1, eax, cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx,
 	   cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx,
 	   cpu_features->cpuid[COMMON_CPUID_INDEX_1].edx);
-  GLRO(dl_x86_cpu_features).cpuid[COMMON_CPUID_INDEX_1].eax = eax;
+  cpu_features->cpuid[COMMON_CPUID_INDEX_1].eax = eax;
   *family = (eax >> 8) & 0x0f;
   *model = (eax >> 4) & 0x0f;
   *extended_model = (eax >> 12) & 0xf0;
@@ -37,6 +37,59 @@ get_common_indeces (struct cpu_features *cpu_features,
       *family += (eax >> 20) & 0xff;
       *model += *extended_model;
     }
+
+  if (cpu_features->max_cpuid >= 7)
+    __cpuid_count (7, 0,
+		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].eax,
+		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].ebx,
+		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].ecx,
+		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].edx);
+
+  /* Can we call xgetbv?  */
+  if (PTR_HAS_CPU_FEATURE (cpu_features, OSXSAVE))
+    {
+      unsigned int xcrlow;
+      unsigned int xcrhigh;
+      asm ("xgetbv" : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0));
+      /* Is YMM and XMM state usable?  */
+      if ((xcrlow & (bit_YMM_state | bit_XMM_state)) ==
+	  (bit_YMM_state | bit_XMM_state))
+	{
+	  /* Determine if AVX is usable.  */
+	  if (PTR_HAS_CPU_FEATURE (cpu_features, AVX))
+	    cpu_features->feature[index_arch_AVX_Usable]
+	      |= bit_arch_AVX_Usable;
+	  /* Determine if AVX2 is usable.  */
+	  if (PTR_HAS_CPU_FEATURE (cpu_features, AVX2))
+	    cpu_features->feature[index_arch_AVX2_Usable]
+	      |= bit_arch_AVX2_Usable;
+	  /* Check if OPMASK state, upper 256-bit of ZMM0-ZMM15 and
+	     ZMM16-ZMM31 state are enabled.  */
+	  if ((xcrlow & (bit_Opmask_state | bit_ZMM0_15_state
+			 | bit_ZMM16_31_state)) ==
+	      (bit_Opmask_state | bit_ZMM0_15_state | bit_ZMM16_31_state))
+	    {
+	      /* Determine if AVX512F is usable.  */
+	      if (PTR_HAS_CPU_FEATURE (cpu_features, AVX512F))
+		{
+		  cpu_features->feature[index_arch_AVX512F_Usable]
+		    |= bit_arch_AVX512F_Usable;
+		  /* Determine if AVX512DQ is usable.  */
+		  if (PTR_HAS_CPU_FEATURE (cpu_features, AVX512DQ))
+		    cpu_features->feature[index_arch_AVX512DQ_Usable]
+		      |= bit_arch_AVX512DQ_Usable;
+		}
+	    }
+	  /* Determine if FMA is usable.  */
+	  if (PTR_HAS_CPU_FEATURE (cpu_features, FMA))
+	    cpu_features->feature[index_arch_FMA_Usable]
+	      |= bit_arch_FMA_Usable;
+	  /* Determine if FMA4 is usable.  */
+	  if (PTR_HAS_CPU_FEATURE (cpu_features, FMA4))
+	    cpu_features->feature[index_arch_FMA4_Usable]
+	      |= bit_arch_FMA4_Usable;
+	}
+    }
 }
 
 static inline void
@@ -135,6 +188,12 @@ init_cpu_features (struct cpu_features *cpu_features)
	      break;
	    }
	}
+
+      /* Unaligned load with 256-bit AVX registers are faster on
+	 Intel processors with AVX2.  */
+      if (PTR_HAS_ARCH_FEATURE (cpu_features, AVX2_Usable))
+	cpu_features->feature[index_arch_AVX_Fast_Unaligned_Load]
+	  |= bit_arch_AVX_Fast_Unaligned_Load;
     }
   /* This spells out "AuthenticAMD".  */
   else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
@@ -168,70 +227,13 @@ init_cpu_features (struct cpu_features *cpu_features)
     kind = arch_kind_other;
 
   /* Support i586 if CX8 is available.  */
-  if (HAS_CPU_FEATURE (CX8))
+  if (PTR_HAS_CPU_FEATURE (cpu_features, CX8))
     cpu_features->feature[index_arch_I586] |= bit_arch_I586;
 
   /* Support i686 if CMOV is available.  */
-  if (HAS_CPU_FEATURE (CMOV))
+  if (PTR_HAS_CPU_FEATURE (cpu_features, CMOV))
     cpu_features->feature[index_arch_I686] |= bit_arch_I686;
 
-  if (cpu_features->max_cpuid >= 7)
-    __cpuid_count (7, 0,
-		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].eax,
-		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].ebx,
-		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].ecx,
-		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].edx);
-
-  /* Can we call xgetbv?  */
-  if (HAS_CPU_FEATURE (OSXSAVE))
-    {
-      unsigned int xcrlow;
-      unsigned int xcrhigh;
-      asm ("xgetbv" : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0));
-      /* Is YMM and XMM state usable?  */
-      if ((xcrlow & (bit_YMM_state | bit_XMM_state)) ==
-	  (bit_YMM_state | bit_XMM_state))
-	{
-	  /* Determine if AVX is usable.  */
-	  if (HAS_CPU_FEATURE (AVX))
-	    cpu_features->feature[index_arch_AVX_Usable]
-	      |= bit_arch_AVX_Usable;
-#if index_arch_AVX2_Usable != index_arch_AVX_Fast_Unaligned_Load
-# error index_arch_AVX2_Usable != index_arch_AVX_Fast_Unaligned_Load
-#endif
-	  /* Determine if AVX2 is usable.  Unaligned load with 256-bit
-	     AVX registers are faster on processors with AVX2.  */
-	  if (HAS_CPU_FEATURE (AVX2))
-	    cpu_features->feature[index_arch_AVX2_Usable]
-	      |= bit_arch_AVX2_Usable | bit_arch_AVX_Fast_Unaligned_Load;
-	  /* Check if OPMASK state, upper 256-bit of ZMM0-ZMM15 and
-	     ZMM16-ZMM31 state are enabled.  */
-	  if ((xcrlow & (bit_Opmask_state | bit_ZMM0_15_state
-			 | bit_ZMM16_31_state)) ==
-	      (bit_Opmask_state | bit_ZMM0_15_state | bit_ZMM16_31_state))
-	    {
-	      /* Determine if AVX512F is usable.  */
-	      if (HAS_CPU_FEATURE (AVX512F))
-		{
-		  cpu_features->feature[index_arch_AVX512F_Usable]
-		    |= bit_arch_AVX512F_Usable;
-		  /* Determine if AVX512DQ is usable.  */
-		  if (HAS_CPU_FEATURE (AVX512DQ))
-		    cpu_features->feature[index_arch_AVX512DQ_Usable]
-		      |= bit_arch_AVX512DQ_Usable;
-		}
-	    }
-	  /* Determine if FMA is usable.  */
-	  if (HAS_CPU_FEATURE (FMA))
-	    cpu_features->feature[index_arch_FMA_Usable]
-	      |= bit_arch_FMA_Usable;
-	  /* Determine if FMA4 is usable.  */
-	  if (HAS_CPU_FEATURE (FMA4))
-	    cpu_features->feature[index_arch_FMA4_Usable]
-	      |= bit_arch_FMA4_Usable;
-	}
-    }
-
 #if !HAS_CPUID
 no_cpuid:
 #endif

diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index 0624a92..92ff8de 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -204,11 +204,17 @@ extern const struct cpu_features *__get_cpu_features (void)
 # endif
 
+/* Only used directly in cpu-features.c.  */
+# define PTR_HAS_CPU_FEATURE(ptr, name) \
+  ((ptr->cpuid[index_cpu_##name].reg_##name & (bit_cpu_##name)) != 0)
+# define PTR_HAS_ARCH_FEATURE(ptr, name) \
+  ((ptr->feature[index_arch_##name] & (bit_arch_##name)) != 0)
+
 /* HAS_* evaluates to true if we may use the feature at runtime.  */
 # define HAS_CPU_FEATURE(name) \
-  ((__get_cpu_features ()->cpuid[index_cpu_##name].reg_##name & (bit_cpu_##name)) != 0)
+  PTR_HAS_CPU_FEATURE (__get_cpu_features (), name)
 # define HAS_ARCH_FEATURE(name) \
-  ((__get_cpu_features ()->feature[index_arch_##name] & (bit_arch_##name)) != 0)
+  PTR_HAS_ARCH_FEATURE (__get_cpu_features (), name)
 
 # define index_cpu_CX8 COMMON_CPUID_INDEX_1
 # define index_cpu_CMOV COMMON_CPUID_INDEX_1
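
For reference, the usable-AVX2 test the patch performs combines three
things: the CPUID AVX/AVX2 feature bits, the OSXSAVE bit, and the XCR0
state mask read with xgetbv.  Below is a minimal standalone sketch of
the same sequence, not part of the patch: it uses GCC's <cpuid.h>
helpers instead of glibc's cpu_features bookkeeping, and the bit
positions are hard-coded from the Intel SDM rather than taken from
cpu-features.h.

#include <cpuid.h>
#include <stdio.h>

int
main (void)
{
  unsigned int eax, ebx, ecx, edx;

  /* CPUID leaf 1: ECX bit 27 is OSXSAVE, bit 28 is AVX.  Without
     OSXSAVE the xgetbv instruction below would fault.  */
  if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx)
      || (ecx & (1u << 27)) == 0 || (ecx & (1u << 28)) == 0)
    {
      puts ("AVX not usable");
      return 0;
    }

  /* CPUID leaf 7, subleaf 0: EBX bit 5 is AVX2.  */
  if (__get_cpuid_max (0, 0) < 7)
    {
      puts ("AVX2 not supported");
      return 0;
    }
  __cpuid_count (7, 0, eax, ebx, ecx, edx);
  if ((ebx & (1u << 5)) == 0)
    {
      puts ("AVX2 not supported");
      return 0;
    }

  /* xgetbv with ECX = 0 reads XCR0.  Bits 1 (XMM) and 2 (YMM) must
     both be set, i.e. the OS saves and restores YMM state across
     context switches.  */
  unsigned int xcrlow, xcrhigh;
  asm ("xgetbv" : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0));
  if ((xcrlow & 0x6) == 0x6)
    puts ("AVX2 usable");
  else
    puts ("AVX2 supported but OS has not enabled YMM state");
  return 0;
}

This is also why the patch keys index_arch_AVX_Fast_Unaligned_Load off
AVX2_Usable rather than the raw CPUID bit: the CPUID bit alone says
nothing about whether the OS manages YMM state.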