@@ -319,6 +319,14 @@ get_amd_cpu (struct __processor_model *cpu_model,
CHECK___builtin_cpu_is ("znver5");
cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER5;
}
+ else if ((model >= 0x50 && model <= 0x5f) ||
+ (model >= 0x80 && model <= 0xcf) ||
+ (model >= 0xd8 && model <= 0xe7))
+ {
+ cpu = "znver6";
+ CHECK___builtin_cpu_is ("znver6");
+ cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER6;
+ }
else if (has_cpu_feature (cpu_model, cpu_features2,
FEATURE_AVX512VP2INTERSECT))
{
@@ -326,6 +334,13 @@ get_amd_cpu (struct __processor_model *cpu_model,
CHECK___builtin_cpu_is ("znver5");
cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER5;
}
+ else if (has_cpu_feature (cpu_model, cpu_features2,
+ FEATURE_AVX512BMM))
+ {
+ cpu = "znver6";
+ CHECK___builtin_cpu_is ("znver6");
+ cpu_model->__cpu_subtype = AMDFAM1AH_ZNVER6;
+ }
break;
default:
break;
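
For reference, a minimal sketch of how user code could dispatch on the new
detection string (an illustration, not part of the patch; __builtin_cpu_init
and __builtin_cpu_is are existing GCC built-ins, only the "znver6" argument
value is new here):

#include <stdio.h>

int
main (void)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_is ("znver6"))
    printf ("zen6 core detected\n");
  else if (__builtin_cpu_is ("znver5"))
    printf ("zen5 core detected\n");
  return 0;
}
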
@@ -1049,6 +1064,14 @@ get_available_features (struct __processor_model *cpu_model,
}
}
+ /* Get Advanced Features at level 0x21 (eax = 0x21). */
+ if (max_cpuid_level >= 0x21)
+ {
+ __cpuid (0x21, eax, ebx, ecx, edx);
+ if (eax & bit_AVX512BMM)
+   set_feature (FEATURE_AVX512BMM);
+ }
+
/* Get Advanced Features at level 0x24 (eax = 0x24, ecx = 0). */
if (avx10_set && max_cpuid_level >= 0x24)
{
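
A standalone probe mirroring the leaf-0x21 hunk above (a hypothetical helper
for illustration, not part of the patch): it uses the existing __get_cpuid_max
and __cpuid helpers from <cpuid.h> and tests EAX bit 23, matching the
bit_AVX512BMM definition added to cpuid.h further down.

#include <cpuid.h>

static int
has_avx512bmm (void)
{
  unsigned int eax, ebx, ecx, edx;
  /* Leaf 0x21 must be enumerated before it may be queried.  */
  if (__get_cpuid_max (0, 0) < 0x21)
    return 0;
  __cpuid (0x21, eax, ebx, ecx, edx);
  return (eax & (1u << 23)) != 0;   /* bit_AVX512BMM */
}
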
@@ -87,6 +87,7 @@ along with GCC; see the file COPYING3. If not see
#define OPTION_MASK_ISA_AVX512BITALG_SET \
(OPTION_MASK_ISA_AVX512BITALG | OPTION_MASK_ISA_AVX512BW_SET)
#define OPTION_MASK_ISA2_AVX512BF16_SET OPTION_MASK_ISA2_AVX512BF16
+#define OPTION_MASK_ISA2_AVX512BMM_SET OPTION_MASK_ISA2_AVX512BMM
#define OPTION_MASK_ISA_RTM_SET OPTION_MASK_ISA_RTM
#define OPTION_MASK_ISA_PRFCHW_SET OPTION_MASK_ISA_PRFCHW
#define OPTION_MASK_ISA_RDSEED_SET OPTION_MASK_ISA_RDSEED
@@ -272,6 +273,7 @@ along with GCC; see the file COPYING3. If not see
#define OPTION_MASK_ISA_AVX512VPOPCNTDQ_UNSET OPTION_MASK_ISA_AVX512VPOPCNTDQ
#define OPTION_MASK_ISA_AVX512BITALG_UNSET OPTION_MASK_ISA_AVX512BITALG
#define OPTION_MASK_ISA2_AVX512BF16_UNSET OPTION_MASK_ISA2_AVX512BF16
+#define OPTION_MASK_ISA2_AVX512BMM_UNSET OPTION_MASK_ISA2_AVX512BMM
#define OPTION_MASK_ISA_RTM_UNSET OPTION_MASK_ISA_RTM
#define OPTION_MASK_ISA_PRFCHW_UNSET OPTION_MASK_ISA_PRFCHW
#define OPTION_MASK_ISA_RDSEED_UNSET OPTION_MASK_ISA_RDSEED
@@ -393,7 +395,8 @@ along with GCC; see the file COPYING3. If not see
#define OPTION_MASK_ISA2_AVX512BW_UNSET \
(OPTION_MASK_ISA2_AVX512BF16_UNSET \
- | OPTION_MASK_ISA2_AVX512FP16_UNSET)
+ | OPTION_MASK_ISA2_AVX512FP16_UNSET \
+ | OPTION_MASK_ISA2_AVX512BMM_UNSET)
/* Set 1 << value as value of -malign-FLAG option. */
@@ -938,6 +941,21 @@ ix86_handle_option (struct gcc_options *opts,
}
return true;
+ case OPT_mavx512bmm:
+ if (value)
+ {
+ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AVX512BMM_SET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AVX512BMM_SET;
+ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW_SET;
+ opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX512BW_SET;
+ }
+ else
+ {
+ opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AVX512BMM_UNSET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AVX512BMM_UNSET;
+ }
+ return true;
+
case OPT_mavxvnni:
if (value)
{
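
Since the OPT_mavx512bmm handler above also turns on AVX512BW, a translation
unit built with -mavx512bmm should see both macros; a compile-time check of
that implication (illustration only):

#if defined (__AVX512BMM__) && !defined (__AVX512BW__)
# error "-mavx512bmm is expected to imply -mavx512bw"
#endif
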
@@ -2151,7 +2169,8 @@ const char *const processor_names[] =
"znver2",
"znver3",
"znver4",
- "znver5"
+ "znver5",
+ "znver6"
};
/* Guarantee that the array is aligned with enum processor_type. */
@@ -2410,6 +2429,9 @@ const pta processor_alias_table[] =
{"znver5", PROCESSOR_ZNVER5, CPU_ZNVER5,
PTA_ZNVER5,
M_CPU_SUBTYPE (AMDFAM1AH_ZNVER5), P_PROC_AVX512F},
+ {"znver6", PROCESSOR_ZNVER6, CPU_ZNVER6,
+ PTA_ZNVER6,
+ M_CPU_SUBTYPE (AMDFAM1AH_ZNVER6), P_PROC_AVX512F},
{"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
PTA_BTVER1,
M_CPU_TYPE (AMD_BTVER1), P_PROC_SSE4_A},
@@ -104,6 +104,7 @@ enum processor_subtypes
INTEL_COREI7_PANTHERLAKE,
ZHAOXIN_FAM7H_YONGFENG,
AMDFAM1AH_ZNVER5,
+ AMDFAM1AH_ZNVER6,
ZHAOXIN_FAM7H_SHIJIDADAO,
INTEL_COREI7_DIAMONDRAPIDS,
INTEL_COREI7_NOVALAKE,
@@ -268,6 +269,7 @@ enum processor_features
FEATURE_USER_MSR,
FEATURE_AVX10_1 = 114,
FEATURE_AVX10_2 = 116,
+ FEATURE_AVX512BMM,
FEATURE_AMX_AVX512,
FEATURE_AMX_TF32,
FEATURE_AMX_FP8 = 120,
@@ -185,6 +185,7 @@ ISA_NAMES_TABLE_START
ISA_NAMES_TABLE_ENTRY("usermsr", FEATURE_USER_MSR, P_NONE, "-musermsr")
ISA_NAMES_TABLE_ENTRY("avx10.1", FEATURE_AVX10_1, P_AVX10_1, "-mavx10.1")
ISA_NAMES_TABLE_ENTRY("avx10.2", FEATURE_AVX10_2, P_NONE, "-mavx10.2")
+ ISA_NAMES_TABLE_ENTRY("avx512bmm", FEATURE_AVX512BMM, P_NONE, "-mavx512bmm")
ISA_NAMES_TABLE_ENTRY("amx-avx512", FEATURE_AMX_AVX512, P_NONE,
"-mamx-avx512")
ISA_NAMES_TABLE_ENTRY("amx-tf32", FEATURE_AMX_TF32, P_NONE, "-mamx-tf32")
@@ -444,8 +444,8 @@ i[34567]86-*-* | x86_64-*-*)
avx512vbmiintrin.h avx512vbmivlintrin.h
avx512vpopcntdqintrin.h clwbintrin.h mwaitxintrin.h
clzerointrin.h pkuintrin.h sgxintrin.h cetintrin.h
- gfniintrin.h cet.h avx512vbmi2intrin.h
- avx512vbmi2vlintrin.h avx512vnniintrin.h
+ gfniintrin.h cet.h avx512vbmi2intrin.h avx512bmmintrin.h
+ avx512bmmvlintrin.h avx512vbmi2vlintrin.h avx512vnniintrin.h
avx512vnnivlintrin.h vaesintrin.h vpclmulqdqintrin.h
avx512vpopcntdqvlintrin.h avx512bitalgintrin.h
avx512bitalgvlintrin.h pconfigintrin.h wbnoinvdintrin.h
@@ -722,7 +722,7 @@ c7 esther"
# 64-bit x86 processors supported by --with-arch=. Each processor
# MUST be separated by exactly one space.
x86_64_archs="amdfam10 athlon64 athlon64-sse3 barcelona bdver1 bdver2 \
-bdver3 bdver4 znver1 znver2 znver3 znver4 znver5 btver1 btver2 k8 k8-sse3 \
+bdver3 bdver4 znver1 znver2 znver3 znver4 znver5 znver6 btver1 btver2 k8 k8-sse3 \
opteron opteron-sse3 nocona core2 corei7 corei7-avx core-avx-i core-avx2 \
atom slm nehalem westmere sandybridge ivybridge haswell broadwell bonnell \
silvermont skylake-avx512 cannonlake icelake-client icelake-server \
@@ -3829,6 +3829,10 @@ case ${target} in
arch=znver5
cpu=znver5
;;
+ znver6-*)
+ arch=znver6
+ cpu=znver6
+ ;;
bdver4-*)
arch=bdver4
cpu=bdver4
@@ -3974,6 +3978,10 @@ case ${target} in
arch=znver5
cpu=znver5
;;
+ znver6-*)
+ arch=znver6
+ cpu=znver6
+ ;;
bdver4-*)
arch=bdver4
cpu=bdver4
new file mode 100644
@@ -0,0 +1,105 @@
+/* Copyright (C) 2025 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <avx512bmmintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512BMMINTRIN_H_INCLUDED
+#define _AVX512BMMINTRIN_H_INCLUDED
+
+#ifndef __AVX512BMM__
+#pragma GCC push_options
+#pragma GCC target("avx512bmm")
+#define __DISABLE_AVX512BMM__
+#endif /* __AVX512BMM__ */
+
+#define _mm512_undefined_epi8 _mm512_undefined_epi32
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setzero_epi8 (void)
+{
+ return __extension__ (__m512i)(__v64qi) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_vbmacor16x16x16_epi16 (__m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_vbmacor16x16x16_v32hi ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_vbmacxor16x16x16_epi16 (__m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_vbmacxor16x16x16_v32hi ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __C);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_vbitrevb_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_vbitrevb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_vbitrevb_epi8 (__mmask64 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_vbitrevb512_mask ((__v64qi) __A,
+ (__v64qi)
+ _mm512_setzero_epi8 (),
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_vbitrevb_epi8 (__m512i __A)
+{
+ return (__m512i) __builtin_ia32_vbitrevb512_mask ((__v64qi) __A,
+ (__v64qi)
+ _mm512_undefined_epi8 (),
+ (__mmask64) -1);
+}
+
+#ifdef __DISABLE_AVX512BMM__
+#undef __DISABLE_AVX512BMM__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BMM__ */
+
+#endif /* _AVX512BMMINTRIN_H_INCLUDED */
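
A minimal usage sketch for the new 512-bit entry points (a hypothetical
helper relying only on what this header declares; the reading of vbmacor as
an OR-accumulating bit-matrix multiply is inferred from the mnemonic):

#include <immintrin.h>

__attribute__ ((target ("avx512bmm")))
__m512i
bitrev_and_accumulate (__m512i acc, __m512i a, __m512i b)
{
  /* Reverse the bit order within each byte of A, then feed the result
     into the 16x16x16 bit-matrix multiply-accumulate.  */
  __m512i rev = _mm512_vbitrevb_epi8 (a);
  return _mm512_vbmacor16x16x16_epi16 (acc, rev, b);
}
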
new file mode 100644
@@ -0,0 +1,140 @@
+/* Copyright (C) 2025 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <avx512bmmvlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512BMMVLINTRIN_H_INCLUDED
+#define _AVX512BMMVLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512BMM__)
+#pragma GCC push_options
+#pragma GCC target("avx512bmm,avx512vl")
+#define __DISABLE_AVX512BMMVL__
+#endif /* __AVX512BMMVL__ */
+
+#define _mm128_undefined_epi8 _mm_avx512_undefined_si128
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm128_setzero_epi8 (void)
+{
+ return __extension__ (__m128i)(__v16qi) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+#define _mm256_undefined_epi8 _mm256_avx512_undefined_si256
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setzero_epi8 (void)
+{
+ return __extension__ (__m256i)(__v32qi) { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_vbmacor16x16x16_epi16 (__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i) __builtin_ia32_vbmacor16x16x16_v16hi ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_vbmacxor16x16x16_epi16 (__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i) __builtin_ia32_vbmacxor16x16x16_v16hi ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __C);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm128_mask_vbitrevb_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vbitrevb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm128_maskz_vbitrevb_epi8 (__mmask16 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_vbitrevb128_mask ((__v16qi) __A,
+ (__v16qi)
+ _mm128_setzero_epi8 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm128_vbitrevb_epi8 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vbitrevb128_mask ((__v16qi) __A,
+ (__v16qi)
+ _mm128_undefined_epi8 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_vbitrevb_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_vbitrevb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_vbitrevb_epi8 (__mmask32 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_vbitrevb256_mask ((__v32qi) __A,
+ (__v32qi)
+ _mm256_setzero_epi8 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_vbitrevb_epi8 (__m256i __A)
+{
+ return (__m256i) __builtin_ia32_vbitrevb256_mask ((__v32qi) __A,
+ (__v32qi)
+ _mm256_undefined_epi8 (),
+ (__mmask32) -1);
+}
+
+#ifdef __DISABLE_AVX512BMMVL__
+#undef __DISABLE_AVX512BMMVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BMMVL__ */
+
+#endif /* _AVX512BMMVLINTRIN_H_INCLUDED */
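
Two hypothetical helpers illustrating the masking forms, following the
operand order this header declares (mask first, then the sources):
merge-masking keeps the merge operand in lanes whose mask bit is clear,
while the maskz form zeroes them.

#include <immintrin.h>

__attribute__ ((target ("avx512bmm,avx512vl")))
__m256i
bitrev_even_lanes (__m256i src, __m256i v)
{
  /* 0x55555555 selects the even byte lanes; odd lanes keep SRC.  */
  return _mm256_mask_vbitrevb_epi8 ((__mmask32) 0x55555555, v, src);
}

__attribute__ ((target ("avx512bmm,avx512vl")))
__m128i
bitrev_low_half (__m128i v)
{
  /* Zero-masking: the upper eight byte lanes become zero.  */
  return _mm128_maskz_vbitrevb_epi8 ((__mmask16) 0x00ff, v);
}
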
@@ -167,6 +167,9 @@
#define bit_AESKLE ( 1<<0 )
#define bit_WIDEKL ( 1<<2 )
+/* AVX512BMM leaf (%eax == 0x21) */
+#define bit_AVX512BMM ( 1<<23 )
+
/* AMX sub leaf (%eax == 0x1e, %ecx == 1) */
/* %eax */
#define bit_AMX_FP8 (1 << 4)
@@ -466,6 +466,8 @@ const char *host_detect_local_cpu (int argc, const char **argv)
processor = PROCESSOR_GEODE;
else if (has_feature (FEATURE_MOVBE) && family == 22)
processor = PROCESSOR_BTVER2;
+ else if (has_feature (FEATURE_AVX512BMM))
+ processor = PROCESSOR_ZNVER6;
else if (has_feature (FEATURE_AVX512VP2INTERSECT))
processor = PROCESSOR_ZNVER5;
else if (has_feature (FEATURE_AVX512F))
@@ -830,6 +832,9 @@ const char *host_detect_local_cpu (int argc, const char **argv)
case PROCESSOR_ZNVER5:
cpu = "znver5";
break;
+ case PROCESSOR_ZNVER6:
+ cpu = "znver6";
+ break;
case PROCESSOR_BTVER1:
cpu = "btver1";
break;
@@ -1398,6 +1398,11 @@ DEF_FUNCTION_TYPE (V8SF, PCV16HF)
DEF_FUNCTION_TYPE (V4SF, PCV8BF)
DEF_FUNCTION_TYPE (V8SF, PCV16BF)
+# AVX512BMM builtins
+DEF_FUNCTION_TYPE (V16QI, V16QI, UHI)
+DEF_FUNCTION_TYPE (V32QI, V32QI, USI)
+DEF_FUNCTION_TYPE (V64QI, V64QI, UDI)
+
# CMPccXADD builtins
DEF_FUNCTION_TYPE (INT, PINT, INT, INT, INT)
DEF_FUNCTION_TYPE (LONGLONG, PLONGLONG, LONGLONG, LONGLONG, INT)
@@ -2881,6 +2881,14 @@ BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_d
BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPBF16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI)
BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_extendbfsf2_1, "__builtin_ia32_cvtbf2sf", IX86_BUILTIN_CVTBF2SF, UNKNOWN, (int) FLOAT_FTYPE_BFLOAT16)
+/* AVX512BMM. */
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbmacor16x16x16_v16hi, "__builtin_ia32_vbmacor16x16x16_v16hi", IX86_BUILTIN_VBMACORV16HI, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbmacor16x16x16_v32hi, "__builtin_ia32_vbmacor16x16x16_v32hi", IX86_BUILTIN_VBMACORV32HI, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbmacxor16x16x16_v16hi, "__builtin_ia32_vbmacxor16x16x16_v16hi", IX86_BUILTIN_VBMACXORV16HI, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbmacxor16x16x16_v32hi, "__builtin_ia32_vbmacxor16x16x16_v32hi", IX86_BUILTIN_VBMACXORV32HI, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbitrevb_v16qi_mask, "__builtin_ia32_vbitrevb128_mask", IX86_BUILTIN_VBITREV16_MASK, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_UHI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbitrevb_v32qi_mask, "__builtin_ia32_vbitrevb256_mask", IX86_BUILTIN_VBITREV32_MASK, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_USI)
+BDESC (0, OPTION_MASK_ISA2_AVX512BMM, CODE_FOR_avx512bmm_vbitrevb_v64qi_mask, "__builtin_ia32_vbitrevb512_mask", IX86_BUILTIN_VBITREV64_MASK, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI_UDI)
/* AVX512FP16. */
BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_addph128_mask", IX86_BUILTIN_ADDPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
@@ -140,6 +140,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
def_or_undef (parse_in, "__znver5");
def_or_undef (parse_in, "__znver5__");
break;
+ case PROCESSOR_ZNVER6:
+ def_or_undef (parse_in, "__znver6");
+ def_or_undef (parse_in, "__znver6__");
+ break;
case PROCESSOR_BTVER1:
def_or_undef (parse_in, "__btver1");
def_or_undef (parse_in, "__btver1__");
@@ -386,6 +390,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
case PROCESSOR_ZNVER5:
def_or_undef (parse_in, "__tune_znver5__");
break;
+ case PROCESSOR_ZNVER6:
+ def_or_undef (parse_in, "__tune_znver6__");
+ break;
case PROCESSOR_BTVER1:
def_or_undef (parse_in, "__tune_btver1__");
break;
@@ -537,6 +544,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
;
}
+ if (isa_flag2 & OPTION_MASK_ISA2_AVX512BMM)
+ def_or_undef (parse_in, "__AVX512BMM__");
if (isa_flag2 & OPTION_MASK_ISA2_WBNOINVD)
def_or_undef (parse_in, "__WBNOINVD__");
if (isa_flag2 & OPTION_MASK_ISA2_AVX512VP2INTERSECT)
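
User code can key off the new __AVX512BMM__ macro exactly like the
neighbouring ISA macros, e.g. to choose between an intrinsic path and a
scalar fallback such as this per-byte bit reversal (a hypothetical helper;
the per-lane semantics of vbitrevb are inferred from the mnemonic):

static inline unsigned char
bitrev8 (unsigned char b)
{
  /* Swap nibbles, then bit pairs, then adjacent bits.  */
  b = (unsigned char) (((b & 0xf0) >> 4) | ((b & 0x0f) << 4));
  b = (unsigned char) (((b & 0xcc) >> 2) | ((b & 0x33) << 2));
  b = (unsigned char) (((b & 0xaa) >> 1) | ((b & 0x55) << 1));
  return b;
}
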
@@ -12623,6 +12623,11 @@ ix86_expand_args_builtin (const struct builtin_description *d,
nargs = 2;
nargs_constant = 1;
break;
+ case V16QI_FTYPE_V16QI_UHI:
+ case V32QI_FTYPE_V32QI_USI:
+ case V64QI_FTYPE_V64QI_UDI:
+ nargs = 2;
+ break;
case V16QI_FTYPE_V16QI_V16QI_V16QI:
case V8SF_FTYPE_V8SF_V8SF_V8SF:
case V4DF_FTYPE_V4DF_V4DF_V4DF:
@@ -120,6 +120,7 @@ DEF_PTA(APX_F)
DEF_PTA(USER_MSR)
DEF_PTA(AVX10_1)
DEF_PTA(AVX10_2)
+DEF_PTA(AVX512BMM)
DEF_PTA(AMX_AVX512)
DEF_PTA(AMX_TF32)
DEF_PTA(AMX_FP8)
@@ -177,11 +177,12 @@ along with GCC; see the file COPYING3. If not see
#define m_ZNVER3 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER3)
#define m_ZNVER4 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER4)
#define m_ZNVER5 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER5)
+#define m_ZNVER6 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER6)
#define m_BTVER1 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER1)
#define m_BTVER2 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER2)
#define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
#define m_BTVER (m_BTVER1 | m_BTVER2)
-#define m_ZNVER (m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5)
+#define m_ZNVER (m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_ZNVER6)
#define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
| m_ZNVER)
@@ -263,6 +264,7 @@ static struct ix86_target_opts isa2_opts[] =
{ "-musermsr", OPTION_MASK_ISA2_USER_MSR },
{ "-mavx10.1", OPTION_MASK_ISA2_AVX10_1 },
{ "-mavx10.2", OPTION_MASK_ISA2_AVX10_2 },
+ { "-mavx512bmm", OPTION_MASK_ISA2_AVX512BMM },
{ "-mamx-avx512", OPTION_MASK_ISA2_AMX_AVX512 },
{ "-mamx-tf32", OPTION_MASK_ISA2_AMX_TF32 },
{ "-mamx-fp8", OPTION_MASK_ISA2_AMX_FP8 },
@@ -811,7 +813,8 @@ static const struct processor_costs *processor_cost_table[] =
&znver2_cost, /* PROCESSOR_ZNVER2. */
&znver3_cost, /* PROCESSOR_ZNVER3. */
&znver4_cost, /* PROCESSOR_ZNVER4. */
- &znver5_cost /* PROCESSOR_ZNVER5. */
+ &znver5_cost, /* PROCESSOR_ZNVER5. */
+ &znver6_cost /* PROCESSOR_ZNVER6. */
};
/* Guarantee that the array is aligned with enum processor_type. */
@@ -1122,6 +1125,7 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
IX86_ATTR_ISA ("usermsr", OPT_musermsr),
IX86_ATTR_ISA ("avx10.1", OPT_mavx10_1),
IX86_ATTR_ISA ("avx10.2", OPT_mavx10_2),
+ IX86_ATTR_ISA ("avx512bmm", OPT_mavx512bmm),
IX86_ATTR_ISA ("amx-avx512", OPT_mamx_avx512),
IX86_ATTR_ISA ("amx-tf32", OPT_mamx_tf32),
IX86_ATTR_ISA ("amx-fp8", OPT_mamx_fp8),
@@ -25543,7 +25543,7 @@ ix86_reassociation_width (unsigned int op, machine_mode mode)
return 1;
/* Znver5 can do 2 integer multiplications per cycle with latency
of 3. */
- if (ix86_tune == PROCESSOR_ZNVER5
+ if ((ix86_tune == PROCESSOR_ZNVER5 || ix86_tune == PROCESSOR_ZNVER6)
&& INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
width = 6;
@@ -2377,6 +2377,7 @@ enum processor_type
PROCESSOR_ZNVER3,
PROCESSOR_ZNVER4,
PROCESSOR_ZNVER5,
+ PROCESSOR_ZNVER6,
PROCESSOR_max
};
@@ -2522,6 +2523,8 @@ constexpr wide_int_bitmask PTA_ZNVER4 = PTA_ZNVER3 | PTA_AVX512F | PTA_AVX512DQ
| PTA_AVX512VNNI | PTA_AVX512BITALG | PTA_AVX512VPOPCNTDQ;
constexpr wide_int_bitmask PTA_ZNVER5 = PTA_ZNVER4 | PTA_AVXVNNI
| PTA_MOVDIRI | PTA_MOVDIR64B | PTA_AVX512VP2INTERSECT | PTA_PREFETCHI;
+constexpr wide_int_bitmask PTA_ZNVER6 = PTA_ZNVER5 | PTA_AVXVNNIINT8
+ | PTA_AVXNECONVERT | PTA_AVX512BMM | PTA_AVXIFMA | PTA_AVX512FP16;
constexpr wide_int_bitmask PTA_BTVER1 = PTA_64BIT | PTA_MMX | PTA_SSE
| PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_SSE4A | PTA_LZCNT | PTA_POPCNT
@@ -530,7 +530,7 @@
(define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,nehalem,
atom,slm,glm,haswell,generic,lujiazui,yongfeng,amdfam10,bdver1,
bdver2,bdver3,bdver4,btver2,znver1,znver2,znver3,znver4,
- znver5"
+ znver5,znver6"
(const (symbol_ref "ix86_schedule")))
;; A basic instruction type. Refinements due to arguments to be
@@ -1353,6 +1353,10 @@ Target Mask(ISA2_AVX10_2) Var(ix86_isa_flags2) Save
Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2,
AVX10.1 and AVX10.2 built-in functions and code generation.
+mavx512bmm
+Target Mask(ISA2_AVX512BMM) Var(ix86_isa_flags2) Save
+Support AVX512BMM built-in functions and code generation.
+
mamx-avx512
Target Mask(ISA2_AMX_AVX512) Var(ix86_isa_flags2) Save
Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2,
@@ -614,3 +614,5 @@ UrlSuffix(gcc/x86-Options.html#index-mmovrs)
mamx-movrs
UrlSuffix(gcc/x86-Options.html#index-mamx-movrs)
+mavx512bmm
+UrlSuffix(gcc/x86-Options.html#index-mavx512bmm)
@@ -70,6 +70,10 @@
#include <avx512ifmavlintrin.h>
+#include <avx512bmmintrin.h>
+
+#include <avx512bmmvlintrin.h>
+
#include <avx512vbmiintrin.h>
#include <avx512vbmivlintrin.h>
@@ -251,6 +251,11 @@
UNSPEC_MINMAXBF16
UNSPEC_MINMAX
+ ;; For AVX512BMM support
+ UNSPEC_VBMACOR
+ UNSPEC_VBMACXOR
+ UNSPEC_VBITREV
+
;; For MOVRS suppport
UNSPEC_VMOVRS
])
@@ -33136,3 +33141,56 @@
(set_attr "prefix" "evex")
(set_attr "memory" "load")
(set_attr "mode" "<sseinsnmode>")])
+
+(define_mode_iterator VI1_AVX512BMM_HI
+ [V32HI (V16HI "TARGET_AVX512VL")])
+
+(define_insn "avx512bmm_vbmacor16x16x16_<mode>"
+ [(set (match_operand:VI1_AVX512BMM_HI 0 "register_operand" "=v")
+ (unspec:VI1_AVX512BMM_HI
+ [(match_operand:VI1_AVX512BMM_HI 1 "register_operand" "0")
+ (match_operand:VI1_AVX512BMM_HI 2 "register_operand" "v")
+ (match_operand:VI1_AVX512BMM_HI 3 "nonimmediate_operand" "vm")]
+ UNSPEC_VBMACOR))]
+ "TARGET_AVX512BMM"
+ "vbmacor16x16x16\t{%3, %2, %0|%0, %2, %3}"
+ [(set_attr ("prefix") ("evex"))
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512bmm_vbmacxor16x16x16_<mode>"
+ [(set (match_operand:VI1_AVX512BMM_HI 0 "register_operand" "=v")
+ (unspec:VI1_AVX512BMM_HI
+ [(match_operand:VI1_AVX512BMM_HI 1 "register_operand" "0")
+ (match_operand:VI1_AVX512BMM_HI 2 "register_operand" "v")
+ (match_operand:VI1_AVX512BMM_HI 3 "nonimmediate_operand" "vm")]
+ UNSPEC_VBMACXOR))]
+ "TARGET_AVX512BMM"
+ "vbmacxor16x16x16\t{%3, %2, %0|%0, %2, %3}"
+ [(set_attr ("prefix") ("evex"))
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_mode_iterator VI1_AVX512BMM_QI
+ [V64QI (V32QI "TARGET_AVX512VL") (V16QI "TARGET_AVX512VL")])
+
+(define_insn "avx512bmm_vbitrevb_<mode>_mask"
+ [(set (match_operand:VI1_AVX512BMM_QI 0 "register_operand" "=v")
+ (vec_merge:VI1_AVX512BMM_QI
+ (unspec:VI1_AVX512BMM_QI
+ [(match_operand:VI1_AVX512BMM_QI 1 "nonimmediate_operand" "vm")]
+ UNSPEC_VBITREV)
+ (match_operand:VI1_AVX512BMM_QI 2 "reg_or_0_operand" "0C")
+ (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")))]
+ "TARGET_AVX512BMM"
+ "vbitrevb\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
+ [(set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512bmm_vbitrevb_<mode>"
+ [(set (match_operand:VI1_AVX512BMM_QI 0 "register_operand" "=v")
+ (unspec:VI1_AVX512BMM_QI
+ [(match_operand:VI1_AVX512BMM_QI 1 "nonimmediate_operand" "vm")]
+	UNSPEC_VBITREV))]
+ "TARGET_AVX512BMM"
+ "vbitrevb\t{%1, %0|%0, %1}"
+ [(set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
@@ -2402,6 +2402,160 @@ struct processor_costs znver5_cost = {
COSTS_N_INSNS (2), /* Branch mispredict scale. */
};
+/* This table currently replicates the znver5_cost table. */
+struct processor_costs znver6_cost = {
+ {
+ /* Start of register allocator costs. integer->integer move cost is 2. */
+
+ /* reg-reg moves are done by renaming and thus they are even cheaper than
+ 1 cycle. Because reg-reg move cost is 2 and following tables correspond
+ to doubles of latencies, we do not model this correctly. It does not
+ seem to make practical difference to bump prices up even more. */
+ 6, /* cost for loading QImode using
+ movzbl. */
+ {6, 6, 6}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {8, 8, 8}, /* cost of storing integer
+ registers. */
+ 2, /* cost of reg,reg fld/fst. */
+ {14, 14, 17}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode. */
+ {12, 12, 16}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode. */
+ 2, /* cost of moving MMX register. */
+ {6, 6}, /* cost of loading MMX registers
+ in SImode and DImode. */
+ {8, 8}, /* cost of storing MMX registers
+ in SImode and DImode. */
+ 2, 2, 3, /* cost of moving XMM,YMM,ZMM
+ register. */
+ {6, 6, 10, 10, 12}, /* cost of loading SSE registers
+ in 32,64,128,256 and 512-bit. */
+ {8, 8, 8, 12, 12}, /* cost of storing SSE registers
+ in 32,64,128,256 and 512-bit. */
+ 6, 8, /* SSE->integer and integer->SSE
+ moves. */
+ 8, 8, /* mask->integer and integer->mask moves */
+ {6, 6, 6}, /* cost of loading mask register
+ in QImode, HImode, SImode. */
+ {8, 8, 8}, /* cost of storing mask register
+ in QImode, HImode, SImode. */
+ 2, /* cost of moving mask register. */
+ /* End of register allocator costs. */
+ },
+
+ COSTS_N_INSNS (1), /* cost of an add instruction. */
+ /* TODO: Lea with 3 components has cost 2. */
+ COSTS_N_INSNS (1), /* cost of a lea instruction. */
+ COSTS_N_INSNS (1), /* variable shift costs. */
+ COSTS_N_INSNS (1), /* constant shift costs. */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
+ COSTS_N_INSNS (3), /* HI. */
+ COSTS_N_INSNS (3), /* SI. */
+ COSTS_N_INSNS (3), /* DI. */
+ COSTS_N_INSNS (3)}, /* other. */
+ 0, /* cost of multiply per each bit
+ set. */
+ {COSTS_N_INSNS (10), /* cost of a divide/mod for QI. */
+ COSTS_N_INSNS (11), /* HI. */
+ COSTS_N_INSNS (13), /* SI. */
+ COSTS_N_INSNS (16), /* DI. */
+ COSTS_N_INSNS (16)}, /* other. */
+ COSTS_N_INSNS (1), /* cost of movsx. */
+ COSTS_N_INSNS (1), /* cost of movzx. */
+ 8, /* "large" insn. */
+ 9, /* MOVE_RATIO. */
+ 6, /* CLEAR_RATIO */
+ {6, 6, 6}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {8, 8, 8}, /* cost of storing integer
+ registers. */
+ {6, 6, 10, 10, 12}, /* cost of loading SSE registers
+ in 32bit, 64bit, 128bit, 256bit and 512bit */
+ {8, 8, 8, 12, 12}, /* cost of storing SSE register
+ in 32bit, 64bit, 128bit, 256bit and 512bit */
+ {6, 6, 10, 10, 12}, /* cost of unaligned loads. */
+ {8, 8, 8, 12, 12}, /* cost of unaligned stores. */
+ 2, 2, 2, /* cost of moving XMM,YMM,ZMM
+ register. */
+ 6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
+ /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
+ throughput 5. Approx 7 uops do not depend on vector size and every load
+ is 5 uops. */
+ 14, 10, /* Gather load static, per_elt. */
+ 14, 20, /* Gather store static, per_elt. */
+ 32, /* size of l1 cache. */
+ 1024, /* size of l2 cache. */
+ 64, /* size of prefetch block. */
+ /* New AMD processors never drop prefetches; if they cannot be performed
+ immediately, they are queued. We set number of simultaneous prefetches
+ to a large constant to reflect this (it probably is not a good idea not
+ to limit number of prefetches at all, as their execution also takes some
+ time). */
+ 100, /* number of parallel prefetches. */
+ 3, /* Branch cost. */
+ COSTS_N_INSNS (7), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (7), /* cost of FMUL instruction. */
+ /* Latency of fdiv is 8-15. */
+ COSTS_N_INSNS (15), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (1), /* cost of FABS instruction. */
+ COSTS_N_INSNS (1), /* cost of FCHS instruction. */
+ /* Latency of fsqrt is 4-10. */
+ COSTS_N_INSNS (25), /* cost of FSQRT instruction. */
+
+ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
+ COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
+ COSTS_N_INSNS (3), /* cost of MULSS instruction. */
+ COSTS_N_INSNS (3), /* cost of MULSD instruction. */
+ COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
+ COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
+ COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
+ /* 9-13. */
+ COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
+ COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
+ COSTS_N_INSNS (20), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (5), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (5), /* cost of 512bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (6), /* cost of CVTSI2SS instruction. */
+ COSTS_N_INSNS (6), /* cost of CVT(T)SS2SI instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
+ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
+ /* Zen5 (whose cost model znver6 currently reuses) can execute:
+ - integer ops: 6 per cycle, at most 3 multiplications.
+ latency 1 for additions, 3 for multiplications (pipelined)
+
+ Setting width of 9 for multiplication is probably excessive
+ for register pressure.
+ - fp ops: 2 additions per cycle, latency 2-3
+ 2 multiplications per cycle, latency 3
+ - vector integer ops: 4 additions, latency 1
+ 2 multiplications, latency 4
+ We increase width to 6 for multiplications
+ in ix86_reassociation_width. */
+ 6, 6, 4, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 6}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
+ znver2_memcpy,
+ znver2_memset,
+ COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
+ COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
+ "16", /* Loop alignment. */
+ "16", /* Jump alignment. */
+ "0:0:8", /* Label alignment. */
+ "16", /* Func alignment. */
+ 4, /* Small unroll limit. */
+ 2, /* Small unroll factor. */
+ COSTS_N_INSNS (2), /* Branch mispredict scale. */
+};
+
/* skylake_cost should produce code tuned for Skylake familly of CPUs. */
static stringop_algs skylake_memcpy[2] = {
{libcall,
@@ -113,6 +113,10 @@ ix86_issue_rate (void)
case PROCESSOR_NOVALAKE:
return 8;
+ /* The znver6 dispatch width is 8, so raise the issue rate to match.  */
+ case PROCESSOR_ZNVER6:
+ return 8;
+
default:
return 1;
}
@@ -438,6 +442,7 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
case PROCESSOR_ZNVER3:
case PROCESSOR_ZNVER4:
case PROCESSOR_ZNVER5:
+ case PROCESSOR_ZNVER6:
/* Stack engine allows to execute push&pop instructions in parall. */
if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
&& (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
@@ -147,13 +147,14 @@ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
TODO: znver5 supports fusing with SUB, ADD, INC, DEC, OR, AND,
There is also limitation for immediate and displacement supported. */
DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
- m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER3 | m_ZNVER4 | m_ZNVER5)
+ m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER3 | m_ZNVER4 | m_ZNVER5
+ | m_ZNVER6)
/* X86_TUNE_FUSE_MOV_AND_ALU: mov and alu in case mov is reg-reg mov
and the destination is used by alu. alu must be one of
ADD, ADC, AND, XOR, OR, SUB, SBB, INC, DEC, NOT, SAL, SHL, SHR, SAR. */
DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu",
- m_ZNVER5 | m_GRANITERAPIDS | m_GRANITERAPIDS_D)
+ m_ZNVER5 | m_ZNVER6 | m_GRANITERAPIDS | m_GRANITERAPIDS_D)
/* X86_TUNE_FUSE_AND_BRANCH_MEM: Fuse alu with a subsequent conditional
jump instruction when alu contains memory operand.
@@ -519,7 +520,7 @@ DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
/* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2
elements. */
DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
- ~(m_ZNVER4 | m_ZNVER5))
+ ~(m_ZNVER4 | m_ZNVER5 | m_ZNVER6))
/* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
elements. */
@@ -530,7 +531,7 @@ DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
/* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4
elements. */
DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
- ~(m_ZNVER4 | m_ZNVER5))
+ ~(m_ZNVER4 | m_ZNVER5 | m_ZNVER6))
/* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
elements. */
@@ -541,7 +542,7 @@ DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
/* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
elements. */
DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
- ~(m_ZNVER4 | m_ZNVER5))
+ ~(m_ZNVER4 | m_ZNVER5 | m_ZNVER6))
/* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
smaller FMA chain. */
@@ -551,13 +552,14 @@ DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER
/* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
smaller FMA chain. */
DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains",
- m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_CORE_HYBRID
+ m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_ZNVER6 | m_CORE_HYBRID
| m_SAPPHIRERAPIDS | m_GRANITERAPIDS | m_GRANITERAPIDS_D
| m_DIAMONDRAPIDS | m_CORE_ATOM | m_GENERIC)
/* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
smaller FMA chain. */
-DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5)
+DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5
+ | m_ZNVER6)
/* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd
for v2df vector reduction. */
@@ -622,7 +624,7 @@ DEF_TUNE (X86_TUNE_AVX256_MOVE_BY_PIECES, "avx256_move_by_pieces",
/* X86_TUNE_AVX512_MOVE_BY_PIECES: Optimize move_by_pieces with 512-bit
AVX instructions. */
DEF_TUNE (X86_TUNE_AVX512_MOVE_BY_PIECES, "avx512_move_by_pieces",
- m_ZNVER4 | m_ZNVER5)
+ m_ZNVER4 | m_ZNVER5 | m_ZNVER6)
/* X86_TUNE_AVX512_TWO_EPILOGUES: Use two vector epilogues for 512-bit
vectorized loops. */
@@ -29068,6 +29068,9 @@ AMD Family 19h Zen version 4.
@item znver5
AMD Family 1ah Zen version 5.
+
+@item znver6
+AMD Family 1ah Zen version 6.
@end table
Here is an example:
@@ -1554,7 +1554,7 @@ See RS/6000 and PowerPC Options.
-mnoreturn-no-callee-saved-registers
-mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -msse4 -mavx
-mavx2 -mavx512f -mavx512cd -mavx512vl
--mavx512bw -mavx512dq -mavx512ifma -mavx512vbmi -msha -maes
+-mavx512bw -mavx512dq -mavx512ifma -mavx512vbmi -mavx512bmm -msha -maes
-mpclmul -mfsgsbase -mrdrnd -mf16c -mfma -mpconfig -mwbnoinvd
-mptwrite -mclflushopt -mclwb -mxsavec -mxsaves
-msse4a -m3dnow -m3dnowa -mpopcnt -mabm -mbmi -mtbm -mfma4 -mxop
@@ -35829,6 +35829,17 @@ AVX512BW, AVX512VL, AVX512BF16, AVX512VBMI, AVX512VBMI2, AVX512VNNI,
AVX512BITALG, AVX512VPOPCNTDQ, GFNI, AVXVNNI, MOVDIRI, MOVDIR64B,
AVX512VP2INTERSECT, PREFETCHI and 64-bit instruction set extensions.)
+@item znver6
+AMD Family 1ah core based CPUs with x86-64 instruction set support. (This
+supersets BMI, BMI2, CLWB, F16C, FMA, FSGSBASE, AVX, AVX2, ADCX, RDSEED,
+MWAITX, SHA, CLZERO, AES, PCLMUL, CX16, MOVBE, MMX, SSE, SSE2, SSE3, SSE4A,
+SSSE3, SSE4.1, SSE4.2, ABM, XSAVEC, XSAVES, CLFLUSHOPT, POPCNT, RDPID,
+WBNOINVD, PKU, VPCLMULQDQ, VAES, AVX512F, AVX512DQ, AVX512IFMA, AVX512CD,
+AVX512BW, AVX512VL, AVX512BF16, AVX512VBMI, AVX512VBMI2, AVX512VNNI,
+AVX512BITALG, AVX512VPOPCNTDQ, GFNI, AVXVNNI, MOVDIRI, MOVDIR64B,
+AVX512VP2INTERSECT, AVXVNNIINT8, AVXNECONVERT, AVXIFMA, AVX512FP16,
+AVX512BMM, PREFETCHI and 64-bit instruction set extensions.)
+
@item btver1
CPUs based on AMD Family 14h cores with x86-64 instruction set support. (This
supersets MMX, SSE, SSE2, SSE3, SSSE3, SSE4A, CX16, ABM and 64-bit
@@ -57,6 +57,10 @@ int __attribute__ ((target("arch=znver5"))) foo () {
return 11;
}
+int __attribute__ ((target("arch=znver6"))) foo () {
+ return 12;
+}
+
int main ()
{
int val = foo ();
@@ -83,6 +87,8 @@ int main ()
assert (val == 10);
else if (__builtin_cpu_is ("znver5"))
assert (val == 11);
+ else if (__builtin_cpu_is ("znver6"))
+ assert (val == 12);
else
assert (val == 0);
new file mode 100644
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bmm -O2" } */
+/* { dg-final { scan-assembler-times "vbmacor16x16x16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbmacxor16x16x16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <immintrin.h>
+
+volatile __m512i x,y,z;
+volatile __mmask64 m;
+
+void extern
+avx512bmm_test (void)
+{
+ x = _mm512_vbmacor16x16x16_epi16 (x, y, z);
+
+ x = _mm512_vbmacxor16x16x16_epi16 (x, y, z);
+
+ x = _mm512_vbitrevb_epi8 (x);
+
+ x = _mm512_mask_vbitrevb_epi8 (m, x, y);
+
+ x = _mm512_maskz_vbitrevb_epi8 (m, x);
+}
new file mode 100644
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bmm -O2" } */
+/* { dg-final { scan-assembler-times "vbmacor16x16x16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbmacxor16x16x16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+
+typedef char v64qi __attribute__ ((__vector_size__ (64)));
+typedef short v32hi __attribute__ ((__vector_size__ (64)));
+
+v32hi
+f1 (v32hi a, v32hi b, v32hi c)
+{
+ return __builtin_ia32_vbmacor16x16x16_v32hi (a, b, c);
+}
+
+v32hi
+f2 (v32hi a, v32hi b, v32hi c)
+{
+ return __builtin_ia32_vbmacxor16x16x16_v32hi (a, b, c);
+}
+
+v64qi
+f3 (v64qi a, v64qi b)
+{
+ return __builtin_ia32_vbitrevb512_mask (a, b, 3);
+}
new file mode 100644
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bmm -mavx512vl -O2" } */
+/* { dg-final { scan-assembler-times "vbmacor16x16x16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbmacxor16x16x16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+
+
+#include <immintrin.h>
+
+volatile __m256i x,y,z;
+volatile __m128i x_,y_,z_;
+volatile __mmask32 m;
+volatile __mmask16 m_;
+
+void extern
+avx512bmm_test (void)
+{
+ x = _mm256_vbmacor16x16x16_epi16 (x, y, z);
+
+ x = _mm256_vbmacxor16x16x16_epi16 (x, y, z);
+
+ x = _mm256_mask_vbitrevb_epi8 (m, x, y);
+ x_ = _mm128_mask_vbitrevb_epi8 (m_, x_, y_);
+
+ x = _mm256_maskz_vbitrevb_epi8 (m, y);
+ x_ = _mm128_maskz_vbitrevb_epi8 (m_, y_);
+
+ x = _mm256_vbitrevb_epi8 (x);
+ x_ = _mm128_vbitrevb_epi8 (x_);
+}
new file mode 100644
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bmm -mavx512vl -O2" } */
+/* { dg-final { scan-assembler-times "vbmacor16x16x16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbmacxor16x16x16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vbitrevb\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+
+typedef char v32qi __attribute__ ((__vector_size__ (32)));
+typedef char v16qi __attribute__ ((__vector_size__ (16)));
+typedef short v16hi __attribute__ ((__vector_size__ (32)));
+
+v16hi
+f1 (v16hi a, v16hi b, v16hi c)
+{
+ return __builtin_ia32_vbmacor16x16x16_v16hi (a, b, c);
+}
+
+v16hi
+f2 (v16hi a, v16hi b, v16hi c)
+{
+ return __builtin_ia32_vbmacxor16x16x16_v16hi (a, b, c);
+}
+
+v32qi
+f3 (v32qi a, v32qi b)
+{
+ return __builtin_ia32_vbitrevb256_mask (a, b, 3);
+}
+
+v16qi
+f4 (v16qi a, v16qi b)
+{
+ return __builtin_ia32_vbitrevb128_mask (a, b, 3);
+}
@@ -238,6 +238,7 @@ extern void test_arch_znver2 (void) __attribute__((__target__("arch=
extern void test_arch_znver3 (void) __attribute__((__target__("arch=znver3")));
extern void test_arch_znver4 (void) __attribute__((__target__("arch=znver4")));
extern void test_arch_znver5 (void) __attribute__((__target__("arch=znver5")));
+extern void test_arch_znver6 (void) __attribute__((__target__("arch=znver6")));
extern void test_tune_nocona (void) __attribute__((__target__("tune=nocona")));
extern void test_tune_core2 (void) __attribute__((__target__("tune=core2")));
@@ -265,6 +266,7 @@ extern void test_tune_znver2 (void) __attribute__((__target__("tune=
extern void test_tune_znver3 (void) __attribute__((__target__("tune=znver3")));
extern void test_tune_znver4 (void) __attribute__((__target__("tune=znver4")));
extern void test_tune_znver5 (void) __attribute__((__target__("tune=znver5")));
+extern void test_tune_znver6 (void) __attribute__((__target__("tune=znver6")));
extern void test_fpmath_sse (void) __attribute__((__target__("sse2,fpmath=sse")));
extern void test_fpmath_387 (void) __attribute__((__target__("sse2,fpmath=387")));