@@ -245,6 +245,10 @@ AARCH64_OPT_EXTENSION("gcs", GCS, (), (), (), "gcs")
AARCH64_OPT_EXTENSION("fp8", FP8, (SIMD), (), (), "fp8")
+AARCH64_OPT_EXTENSION("fp8fma", FP8FMA, (FP8), (), (), "fp8fma")
+
+AARCH64_OPT_EXTENSION("ssve-fp8fma", SSVE_FP8FMA, (SME2,FP8), (), (), "ssve-fp8fma")
+
AARCH64_OPT_EXTENSION("faminmax", FAMINMAX, (SIMD), (), (), "faminmax")
#undef AARCH64_OPT_FMV_EXTENSION
@@ -270,10 +270,12 @@ public:
CONSTEXPR unspec_based_function_base (int unspec_for_sint,
int unspec_for_uint,
int unspec_for_fp,
+ int unspec_for_mfp8 = -1,
unsigned int suffix_index = 0)
: m_unspec_for_sint (unspec_for_sint),
m_unspec_for_uint (unspec_for_uint),
m_unspec_for_fp (unspec_for_fp),
+ m_unspec_for_mfp8 (unspec_for_mfp8),
m_suffix_index (suffix_index)
{}
@@ -281,6 +283,9 @@ public:
int
unspec_for (const function_instance &instance) const
{
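+    /* For FP8 intrinsics (FPM_set) the choice of unspec is independent
+       of the type suffixes; use the dedicated mfloat8 unspec.  */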
+ if (instance.fpm_mode == FPM_set)
+ return m_unspec_for_mfp8;
+
auto &suffix = instance.type_suffix (m_suffix_index);
return (!suffix.integer_p ? m_unspec_for_fp
: suffix.unsigned_p ? m_unspec_for_uint
@@ -292,6 +297,7 @@ public:
int m_unspec_for_sint;
int m_unspec_for_uint;
int m_unspec_for_fp;
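+  /* The unspec to use when the FPM register is set (fpm_mode == FPM_set),
+     or -1 if the function has no mfloat8_t form.  */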
+ int m_unspec_for_mfp8;
/* Which type suffix is used to choose between the unspecs. */
unsigned int m_suffix_index;
@@ -427,7 +433,7 @@ public:
CONSTEXPR sme_1mode_function (int unspec_for_sint, int unspec_for_uint,
int unspec_for_fp)
- : parent (unspec_for_sint, unspec_for_uint, unspec_for_fp, 1)
+ : parent (unspec_for_sint, unspec_for_uint, unspec_for_fp, -1, 1)
{}
rtx
@@ -457,7 +463,7 @@ public:
CONSTEXPR sme_2mode_function_t (int unspec_for_sint, int unspec_for_uint,
int unspec_for_fp)
- : parent (unspec_for_sint, unspec_for_uint, unspec_for_fp, 1)
+ : parent (unspec_for_sint, unspec_for_uint, unspec_for_fp, -1, 1)
{}
rtx
@@ -496,7 +502,8 @@ public:
{
int unspec = unspec_for (e);
insn_code icode;
- if (e.type_suffix (m_suffix_index).float_p)
+ if (e.type_suffix (m_suffix_index).float_p
+ && e.fpm_mode != FPM_set)
{
/* Put the operands in the normal (fma ...) order, with the accumulator
last. This fits naturally since that's also the unprinted operand
@@ -526,7 +533,8 @@ public:
{
int unspec = unspec_for (e);
insn_code icode;
- if (e.type_suffix (m_suffix_index).float_p)
+ if (e.type_suffix (m_suffix_index).float_p
+ && e.fpm_mode != FPM_set)
{
/* Put the operands in the normal (fma ...) order, with the accumulator
last. This fits naturally since that's also the unprinted operand
@@ -96,6 +96,7 @@ apply_predication (const function_instance &instance, tree return_type,
B - bfloat16_t
c - a predicate-as-counter
h<elt> - a half-sized version of <elt>
+ M - mfloat8_t
p - a predicate (represented as TYPE_SUFFIX_b)
q<elt> - a quarter-sized version of <elt>
s<bits> - a signed type with the given number of bits
@@ -140,6 +141,9 @@ parse_element_type (const function_instance &instance, const char *&format)
if (ch == 'B')
return TYPE_SUFFIX_bf16;
+ if (ch == 'M')
+ return TYPE_SUFFIX_mf8;
+
if (ch == 'q')
{
type_suffix_index suffix = parse_element_type (instance, format);
@@ -4015,6 +4019,44 @@ SHAPE (ternary_bfloat_lane)
typedef ternary_bfloat_lane_base<2> ternary_bfloat_lanex2_def;
SHAPE (ternary_bfloat_lanex2)
+/* sv<t0>_t svfoo[_t0](sv<t0>_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t)
+
+   where the one-but-last argument is an integer constant expression in the
+   range [0, 15]. */
+struct ternary_mfloat8_lane_def
+ : public ternary_resize2_lane_base<8, TYPE_mfloat, TYPE_mfloat>
+{
+ void
+ build (function_builder &b, const function_group_info &group) const override
+ {
+ gcc_assert (group.fpm_mode == FPM_set);
+ b.add_overloaded_functions (group, MODE_none);
+ build_all (b, "v0,v0,vM,vM,su64", group, MODE_none);
+ }
+
+ bool
+ check (function_checker &c) const override
+ {
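+    /* The lane index (argument 3) must select one of the 16 bytes of
+       the second svmfloat8_t argument.  */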
+ return c.require_immediate_lane_index (3, 2, 1);
+ }
+
+ tree
+ resolve (function_resolver &r) const override
+ {
+ type_suffix_index type;
+ if (!r.check_num_arguments (5)
+ || (type = r.infer_vector_type (0)) == NUM_TYPE_SUFFIXES
+ || !r.require_vector_type (1, VECTOR_TYPE_svmfloat8_t)
+ || !r.require_vector_type (2, VECTOR_TYPE_svmfloat8_t)
+ || !r.require_integer_immediate (3)
+ || !r.require_scalar_type (4, "uint64_t"))
+ return error_mark_node;
+
+ return r.resolve_to (r.mode_suffix_id, type, TYPE_SUFFIX_mf8, GROUP_none);
+ }
+};
+SHAPE (ternary_mfloat8_lane)
+
/* sv<t0>_t svfoo[_t0](sv<t0>_t, svbfloat16_t, svbfloat16_t)
sv<t0>_t svfoo[_n_t0](sv<t0>_t, svbfloat16_t, bfloat16_t). */
struct ternary_bfloat_opt_n_def
@@ -4030,6 +4072,42 @@ struct ternary_bfloat_opt_n_def
};
SHAPE (ternary_bfloat_opt_n)
+/* sv<t0>_t svfoo[_t0](sv<t0>_t, svmfloat8_t, svmfloat8_t, fpm_t)
+   sv<t0>_t svfoo[_n_t0](sv<t0>_t, svmfloat8_t, mfloat8_t, fpm_t). */
+struct ternary_mfloat8_opt_n_def
+ : public ternary_resize2_opt_n_base<8, TYPE_mfloat, TYPE_mfloat>
+{
+ void
+ build (function_builder &b, const function_group_info &group) const override
+ {
+ gcc_assert (group.fpm_mode == FPM_set);
+ b.add_overloaded_functions (group, MODE_none);
+ build_all (b, "v0,v0,vM,vM", group, MODE_none);
+ build_all (b, "v0,v0,vM,sM", group, MODE_n);
+ }
+
+ tree
+ resolve (function_resolver &r) const override
+ {
+ type_suffix_index type;
+ if (!r.check_num_arguments (4)
+ || (type = r.infer_vector_type (0)) == NUM_TYPE_SUFFIXES
+ || !r.require_vector_type (1, VECTOR_TYPE_svmfloat8_t)
+ || !r.require_vector_or_scalar_type (2)
+ || !r.require_scalar_type (3, "uint64_t"))
+ return error_mark_node;
+
+ auto mode = r.mode_suffix_id;
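+    /* A scalar mfloat8_t third argument selects the _n form of the
+       intrinsic.  */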
+ if (r.scalar_argument_p (2))
+ mode = MODE_n;
+ else if (!r.require_vector_type (2, VECTOR_TYPE_svmfloat8_t))
+ return error_mark_node;
+
+ return r.resolve_to (mode, type, TYPE_SUFFIX_mf8, GROUP_none);
+ }
+};
+SHAPE (ternary_mfloat8_opt_n)
+
/* sv<t0>_t svfoo[_t0](sv<t0>_t, sv<t0:int:quarter>_t, sv<t0:uint:quarter>_t,
uint64_t)
@@ -213,6 +213,8 @@ namespace aarch64_sve
extern const function_shape *const ternary_lane_rotate;
extern const function_shape *const ternary_long_lane;
extern const function_shape *const ternary_long_opt_n;
+ extern const function_shape *const ternary_mfloat8_lane;
+ extern const function_shape *const ternary_mfloat8_opt_n;
extern const function_shape *const ternary_opt_n;
extern const function_shape *const ternary_qq_or_011_lane;
extern const function_shape *const ternary_qq_lane_rotate;
@@ -990,16 +990,34 @@ FUNCTION (svminnmqv, reduction, (-1, -1, UNSPEC_FMINNMQV))
FUNCTION (svminp, unspec_based_pred_function, (UNSPEC_SMINP, UNSPEC_UMINP,
UNSPEC_FMINP))
FUNCTION (svminqv, reduction, (UNSPEC_SMINQV, UNSPEC_UMINQV, UNSPEC_FMINQV))
-FUNCTION (svmlalb, unspec_based_mla_function, (UNSPEC_SMULLB,
- UNSPEC_UMULLB, UNSPEC_FMLALB))
-FUNCTION (svmlalb_lane, unspec_based_mla_lane_function, (UNSPEC_SMULLB,
- UNSPEC_UMULLB,
- UNSPEC_FMLALB))
-FUNCTION (svmlalt, unspec_based_mla_function, (UNSPEC_SMULLT,
- UNSPEC_UMULLT, UNSPEC_FMLALT))
-FUNCTION (svmlalt_lane, unspec_based_mla_lane_function, (UNSPEC_SMULLT,
- UNSPEC_UMULLT,
- UNSPEC_FMLALT))
+FUNCTION (svmlalb, unspec_based_mla_function,
+ (UNSPEC_SMULLB, UNSPEC_UMULLB, UNSPEC_FMLALB,
+ UNSPEC_FMLALB_FP8))
+FUNCTION (svmlalb_lane, unspec_based_mla_lane_function,
+ (UNSPEC_SMULLB, UNSPEC_UMULLB, UNSPEC_FMLALB,
+ UNSPEC_FMLALB_FP8))
+FUNCTION (svmlallbb, unspec_based_mla_function,
+ (-1, -1, -1, UNSPEC_FMLALLBB_FP8))
+FUNCTION (svmlallbb_lane, unspec_based_mla_lane_function,
+ (-1, -1, -1, UNSPEC_FMLALLBB_FP8))
+FUNCTION (svmlallbt, unspec_based_mla_function,
+ (-1, -1, -1, UNSPEC_FMLALLBT_FP8))
+FUNCTION (svmlallbt_lane, unspec_based_mla_lane_function,
+ (-1, -1, -1, UNSPEC_FMLALLBT_FP8))
+FUNCTION (svmlalltb, unspec_based_mla_function,
+ (-1, -1, -1, UNSPEC_FMLALLTB_FP8))
+FUNCTION (svmlalltb_lane, unspec_based_mla_lane_function,
+ (-1, -1, -1, UNSPEC_FMLALLTB_FP8))
+FUNCTION (svmlalltt, unspec_based_mla_function,
+ (-1, -1, -1, UNSPEC_FMLALLTT_FP8))
+FUNCTION (svmlalltt_lane, unspec_based_mla_lane_function,
+ (-1, -1, -1, UNSPEC_FMLALLTT_FP8))
+FUNCTION (svmlalt, unspec_based_mla_function,
+ (UNSPEC_SMULLT, UNSPEC_UMULLT, UNSPEC_FMLALT,
+ UNSPEC_FMLALT_FP8))
+FUNCTION (svmlalt_lane, unspec_based_mla_lane_function,
+ (UNSPEC_SMULLT, UNSPEC_UMULLT, UNSPEC_FMLALT,
+ UNSPEC_FMLALT_FP8))
FUNCTION (svmlslb, unspec_based_mls_function, (UNSPEC_SMULLB,
UNSPEC_UMULLB, UNSPEC_FMLSLB))
FUNCTION (svmlslb_lane, unspec_based_mls_lane_function, (UNSPEC_SMULLB,
@@ -1072,15 +1090,15 @@ FUNCTION (svqrdmulh_lane, unspec_based_lane_function, (UNSPEC_SQRDMULH,
-1, -1))
FUNCTION (svqrshl, svqrshl_impl,)
FUNCTION (svqrshr, unspec_based_uncond_function, (UNSPEC_SQRSHR,
- UNSPEC_UQRSHR, -1, 1))
+ UNSPEC_UQRSHR, -1, -1, 1))
FUNCTION (svqrshrn, unspec_based_uncond_function, (UNSPEC_SQRSHRN,
- UNSPEC_UQRSHRN, -1, 1))
+ UNSPEC_UQRSHRN, -1, -1, 1))
FUNCTION (svqrshrnb, unspec_based_function, (UNSPEC_SQRSHRNB,
UNSPEC_UQRSHRNB, -1))
FUNCTION (svqrshrnt, unspec_based_function, (UNSPEC_SQRSHRNT,
UNSPEC_UQRSHRNT, -1))
-FUNCTION (svqrshru, unspec_based_uncond_function, (UNSPEC_SQRSHRU, -1, -1, 1))
-FUNCTION (svqrshrun, unspec_based_uncond_function, (UNSPEC_SQRSHRUN, -1, -1, 1))
+FUNCTION (svqrshru, unspec_based_uncond_function, (UNSPEC_SQRSHRU, -1, -1, -1, 1))
+FUNCTION (svqrshrun, unspec_based_uncond_function, (UNSPEC_SQRSHRUN, -1, -1, -1, 1))
FUNCTION (svqrshrunb, unspec_based_function, (UNSPEC_SQRSHRUNB, -1, -1))
FUNCTION (svqrshrunt, unspec_based_function, (UNSPEC_SQRSHRUNT, -1, -1))
FUNCTION (svqshl, svqshl_impl,)
@@ -379,3 +379,20 @@ DEF_SVE_FUNCTION_GS_FPM (svcvtn, unary_convertxn_narrow, cvtn_mf8, x2, none, set
DEF_SVE_FUNCTION_GS_FPM (svcvtnb, unary_convertxn_narrow, cvtnx_mf8, x2, none, set)
DEF_SVE_FUNCTION_GS_FPM (svcvtnt, unary_convertxn_narrowt, cvtnx_mf8, x2, none, set)
#undef REQUIRED_EXTENSIONS
+
+#define REQUIRED_EXTENSIONS \
+ streaming_compatible (AARCH64_FL_SVE2 | AARCH64_FL_FP8FMA, \
+ AARCH64_FL_SSVE_FP8FMA)
+DEF_SVE_FUNCTION_GS_FPM (svmlalb, ternary_mfloat8_opt_n, h_float_mf8, none, none, set)
+DEF_SVE_FUNCTION_GS_FPM (svmlalt, ternary_mfloat8_opt_n, h_float_mf8, none, none, set)
+DEF_SVE_FUNCTION_GS_FPM (svmlalb_lane, ternary_mfloat8_lane, h_float_mf8, none, none, set)
+DEF_SVE_FUNCTION_GS_FPM (svmlalt_lane, ternary_mfloat8_lane, h_float_mf8, none, none, set)
+DEF_SVE_FUNCTION_GS_FPM (svmlallbb, ternary_mfloat8_opt_n, s_float_mf8, none, none, set)
+DEF_SVE_FUNCTION_GS_FPM (svmlallbt, ternary_mfloat8_opt_n, s_float_mf8, none, none, set)
+DEF_SVE_FUNCTION_GS_FPM (svmlalltb, ternary_mfloat8_opt_n, s_float_mf8, none, none, set)
+DEF_SVE_FUNCTION_GS_FPM (svmlalltt, ternary_mfloat8_opt_n, s_float_mf8, none, none, set)
+DEF_SVE_FUNCTION_GS_FPM (svmlallbb_lane, ternary_mfloat8_lane, s_float_mf8, none, none, set)
+DEF_SVE_FUNCTION_GS_FPM (svmlallbt_lane, ternary_mfloat8_lane, s_float_mf8, none, none, set)
+DEF_SVE_FUNCTION_GS_FPM (svmlalltb_lane, ternary_mfloat8_lane, s_float_mf8, none, none, set)
+DEF_SVE_FUNCTION_GS_FPM (svmlalltt_lane, ternary_mfloat8_lane, s_float_mf8, none, none, set)
+#undef REQUIRED_EXTENSIONS
@@ -108,6 +108,14 @@ namespace aarch64_sve
extern const function_base *const svminqv;
extern const function_base *const svmlalb;
extern const function_base *const svmlalb_lane;
+ extern const function_base *const svmlallbb;
+ extern const function_base *const svmlallbb_lane;
+ extern const function_base *const svmlallbt;
+ extern const function_base *const svmlallbt_lane;
+ extern const function_base *const svmlalltb;
+ extern const function_base *const svmlalltb_lane;
+ extern const function_base *const svmlalltt;
+ extern const function_base *const svmlalltt_lane;
extern const function_base *const svmlalt;
extern const function_base *const svmlalt_lane;
extern const function_base *const svmlslb;
@@ -347,10 +347,18 @@ CONSTEXPR const group_suffix_info group_suffixes[] = {
TYPES_s_data (S, D), \
TYPES_d_data (S, D)
+/* _f16_mf8. */
+#define TYPES_h_float_mf8(S, D) \
+ D (f16, mf8)
+
/* _f32. */
#define TYPES_s_float(S, D) \
S (f32)
+/* _f32_mf8. */
+#define TYPES_s_float_mf8(S, D) \
+ D (f32, mf8)
+
/* _f32
_s16 _s32 _s64
_u16 _u32 _u64. */
@@ -777,6 +785,7 @@ DEF_SVE_TYPES_ARRAY (bhs_widen);
DEF_SVE_TYPES_ARRAY (c);
DEF_SVE_TYPES_ARRAY (h_bfloat);
DEF_SVE_TYPES_ARRAY (h_float);
+DEF_SVE_TYPES_ARRAY (h_float_mf8);
DEF_SVE_TYPES_ARRAY (h_integer);
DEF_SVE_TYPES_ARRAY (hs_signed);
DEF_SVE_TYPES_ARRAY (hs_integer);
@@ -788,6 +797,7 @@ DEF_SVE_TYPES_ARRAY (hsd_integer);
DEF_SVE_TYPES_ARRAY (hsd_data);
DEF_SVE_TYPES_ARRAY (s_float);
DEF_SVE_TYPES_ARRAY (s_float_hsd_integer);
+DEF_SVE_TYPES_ARRAY (s_float_mf8);
DEF_SVE_TYPES_ARRAY (s_float_sd_integer);
DEF_SVE_TYPES_ARRAY (s_signed);
DEF_SVE_TYPES_ARRAY (s_unsigned);
@@ -67,6 +67,7 @@
;; ---- [INT] Shift-and-accumulate operations
;; ---- [INT] Shift-and-insert operations
;; ---- [INT] Sum of absolute differences
+;; ---- [FP] Mfloat8 Multiply-and-accumulate operations
;;
;; == Extending arithmetic
;; ---- [INT] Multi-register widening conversions
@@ -1993,6 +1994,86 @@ (define_insn "*aarch64_sve2_<su>aba<mode>"
}
)
+;; -------------------------------------------------------------------------
+;; ---- [FP] Mfloat8 Multiply-and-accumulate operations
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - FMLALB (vectors, FP8 to FP16)
+;; - FMLALT (vectors, FP8 to FP16)
+;; - FMLALB (indexed, FP8 to FP16)
+;; - FMLALT (indexed, FP8 to FP16)
+;; - FMLALLBB (vectors)
+;; - FMLALLBB (indexed)
+;; - FMLALLBT (vectors)
+;; - FMLALLBT (indexed)
+;; - FMLALLTB (vectors)
+;; - FMLALLTB (indexed)
+;; - FMLALLTT (vectors)
+;; - FMLALLTT (indexed)
+;; -------------------------------------------------------------------------
+
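+;; The FPM register is an implicit input to each of these instructions,
+;; which is why (reg:DI FPM_REGNUM) appears in the unspecs.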
+(define_insn "@aarch64_sve_add_<sve2_fp8_fma_op_vnx8hf><mode>"
+ [(set (match_operand:VNx8HF_ONLY 0 "register_operand")
+ (unspec:VNx8HF_ONLY
+ [(match_operand:VNx8HF 1 "register_operand")
+ (match_operand:VNx16QI 2 "register_operand")
+ (match_operand:VNx16QI 3 "register_operand")
+ (reg:DI FPM_REGNUM)]
+ SVE2_FP8_TERNARY_VNX8HF))]
+ "TARGET_SSVE_FP8FMA"
+ {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
+ [ w , 0 , w , w ; * ] <sve2_fp8_fma_op_vnx8hf>\t%0.h, %2.b, %3.b
+ [ ?&w , w , w , w ; yes ] movprfx\t%0, %1\;<sve2_fp8_fma_op_vnx8hf>\t%0.h, %2.b, %3.b
+ }
+)
+
+(define_insn "@aarch64_sve_add_<sve2_fp8_fma_op_vnx4sf><mode>"
+ [(set (match_operand:VNx4SF_ONLY 0 "register_operand")
+ (unspec:VNx4SF_ONLY
+ [(match_operand:VNx4SF 1 "register_operand")
+ (match_operand:VNx16QI 2 "register_operand")
+ (match_operand:VNx16QI 3 "register_operand")
+ (reg:DI FPM_REGNUM)]
+ SVE2_FP8_TERNARY_VNX4SF))]
+ "TARGET_SSVE_FP8FMA"
+ {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
+ [ w , 0 , w , w ; * ] <sve2_fp8_fma_op_vnx4sf>\t%0.s, %2.b, %3.b
+ [ ?&w , w , w , w ; yes ] movprfx\t%0, %1\;<sve2_fp8_fma_op_vnx4sf>\t%0.s, %2.b, %3.b
+ }
+)
+
+(define_insn "@aarch64_sve_add_lane_<sve2_fp8_fma_op_vnx8hf><mode>"
+ [(set (match_operand:VNx8HF_ONLY 0 "register_operand")
+ (unspec:VNx8HF_ONLY
+ [(match_operand:VNx8HF 1 "register_operand")
+ (match_operand:VNx16QI 2 "register_operand")
+ (match_operand:VNx16QI 3 "register_operand")
+ (match_operand:SI 4 "const_int_operand")
+ (reg:DI FPM_REGNUM)]
+ SVE2_FP8_TERNARY_LANE_VNX8HF))]
+ "TARGET_SSVE_FP8FMA"
+ {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
+ [ w , 0 , w , y ; * ] <sve2_fp8_fma_op_vnx8hf>\t%0.h, %2.b, %3.b[%4]
+ [ ?&w , w , w , y ; yes ] movprfx\t%0, %1\;<sve2_fp8_fma_op_vnx8hf>\t%0.h, %2.b, %3.b[%4]
+ }
+)
+
+(define_insn "@aarch64_sve_add_lane_<sve2_fp8_fma_op_vnx4sf><mode>"
+ [(set (match_operand:VNx4SF_ONLY 0 "register_operand")
+ (unspec:VNx4SF_ONLY
+ [(match_operand:VNx4SF 1 "register_operand")
+ (match_operand:VNx16QI 2 "register_operand")
+ (match_operand:VNx16QI 3 "register_operand")
+ (match_operand:SI 4 "const_int_operand")
+ (reg:DI FPM_REGNUM)]
+ SVE2_FP8_TERNARY_LANE_VNX4SF))]
+ "TARGET_SSVE_FP8FMA"
+ {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
+ [ w , 0 , w , y ; * ] <sve2_fp8_fma_op_vnx4sf>\t%0.s, %2.b, %3.b[%4]
+ [ ?&w , w , w , y ; yes ] movprfx\t%0, %1\;<sve2_fp8_fma_op_vnx4sf>\t%0.s, %2.b, %3.b[%4]
+ }
+)
+
;; =========================================================================
;; == Extending arithmetic
;; =========================================================================
@@ -518,6 +518,15 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
&& (TARGET_SVE2 || TARGET_STREAMING) \
&& (TARGET_SME2 || TARGET_NON_STREAMING))
+/* fp8 multiply-accumulate instructions are enabled through +fp8fma. */
+#define TARGET_FP8FMA AARCH64_HAVE_ISA (FP8FMA)
+
+/* SVE2 versions of fp8 multiply-accumulate instructions are enabled for
+   non-streaming mode by +fp8fma and for streaming mode by +ssve-fp8fma:
+   non-streaming code requires SVE2 and FP8FMA, streaming code requires
+   SSVE_FP8FMA.  */
+#define TARGET_SSVE_FP8FMA \
+ (((TARGET_SVE2 && TARGET_FP8FMA) || TARGET_STREAMING) \
+ && (AARCH64_HAVE_ISA (SSVE_FP8FMA) || TARGET_NON_STREAMING))
+
/* Standard register usage. */
/* 31 64-bit general purpose registers R0-R30:
@@ -430,6 +430,7 @@ (define_mode_iterator VMULD [V4HI V8HI V2SI V4SI
(define_mode_iterator VNx16QI_ONLY [VNx16QI])
(define_mode_iterator VNx16SI_ONLY [VNx16SI])
(define_mode_iterator VNx8HI_ONLY [VNx8HI])
+(define_mode_iterator VNx8HF_ONLY [VNx8HF])
(define_mode_iterator VNx8BF_ONLY [VNx8BF])
(define_mode_iterator VNx8SI_ONLY [VNx8SI])
(define_mode_iterator VNx8SF_ONLY [VNx8SF])
@@ -975,7 +976,13 @@ (define_c_enum "unspec"
UNSPEC_FMINNMP ; Used in aarch64-sve2.md.
UNSPEC_FMINP ; Used in aarch64-sve2.md.
UNSPEC_FMLALB ; Used in aarch64-sve2.md.
+ UNSPEC_FMLALB_FP8 ; Used in aarch64-sve2.md.
+ UNSPEC_FMLALLBB_FP8 ; Used in aarch64-sve2.md.
+ UNSPEC_FMLALLBT_FP8 ; Used in aarch64-sve2.md.
+ UNSPEC_FMLALLTB_FP8 ; Used in aarch64-sve2.md.
+ UNSPEC_FMLALLTT_FP8 ; Used in aarch64-sve2.md.
UNSPEC_FMLALT ; Used in aarch64-sve2.md.
+ UNSPEC_FMLALT_FP8 ; Used in aarch64-sve2.md.
UNSPEC_FMLSLB ; Used in aarch64-sve2.md.
UNSPEC_FMLSLT ; Used in aarch64-sve2.md.
UNSPEC_FP8FCVTN ; Used in aarch64-sve2.md.
@@ -4755,3 +4762,33 @@ (define_int_attr fp8_cvt_uns_op
(UNSPEC_F2CVT "f2cvt")
(UNSPEC_F1CVTLT "f1cvtlt")
(UNSPEC_F2CVTLT "f2cvtlt")])
+
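+;; The SVE2 FP8 multiply-accumulate unspecs, grouped by accumulator mode:
+;; VNx8HF for FMLALB/FMLALT and VNx4SF for the FMLALL* forms.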
+(define_int_iterator SVE2_FP8_TERNARY_VNX8HF
+ [UNSPEC_FMLALB_FP8
+ UNSPEC_FMLALT_FP8])
+
+(define_int_iterator SVE2_FP8_TERNARY_VNX4SF
+ [UNSPEC_FMLALLBB_FP8
+ UNSPEC_FMLALLBT_FP8
+ UNSPEC_FMLALLTB_FP8
+ UNSPEC_FMLALLTT_FP8])
+
+(define_int_iterator SVE2_FP8_TERNARY_LANE_VNX8HF
+ [UNSPEC_FMLALB_FP8
+ UNSPEC_FMLALT_FP8])
+
+(define_int_iterator SVE2_FP8_TERNARY_LANE_VNX4SF
+ [UNSPEC_FMLALLBB_FP8
+ UNSPEC_FMLALLBT_FP8
+ UNSPEC_FMLALLTB_FP8
+ UNSPEC_FMLALLTT_FP8])
+
+(define_int_attr sve2_fp8_fma_op_vnx8hf
+ [(UNSPEC_FMLALB_FP8 "fmlalb")
+ (UNSPEC_FMLALT_FP8 "fmlalt")])
+
+(define_int_attr sve2_fp8_fma_op_vnx4sf
+ [(UNSPEC_FMLALLBB_FP8 "fmlallbb")
+ (UNSPEC_FMLALLBT_FP8 "fmlallbt")
+ (UNSPEC_FMLALLTB_FP8 "fmlalltb")
+ (UNSPEC_FMLALLTT_FP8 "fmlalltt")])
@@ -21952,6 +21952,11 @@ Enable support for Armv8.9-a/9.4-a translation hardening extension.
Enable the RCpc3 (Release Consistency) extension.
@item fp8
Enable the fp8 (8-bit floating point) extension.
+@item fp8fma
+Enable the fp8 (8-bit floating point) multiply-accumulate extension.
+@item ssve-fp8fma
+Enable the fp8 (8-bit floating point) multiply-accumulate extension in
+streaming mode.
@item faminmax
Enable the Floating Point Absolute Maximum/Minimum extension.
@item sve-b16b16
@@ -84,7 +84,7 @@
#define TEST_DUAL_Z_REV(NAME, TYPE1, TYPE2, CODE1, CODE2) \
PROTO (NAME, TYPE1, (TYPE2 z0, TYPE2 z1, TYPE2 z2, TYPE2 z3, \
TYPE1 z4, TYPE1 z5, TYPE1 z6, TYPE1 z7, \
- svbool_t p0, svbool_t p1)) \
+ svbool_t p0, svbool_t p1, fpm_t fpm0)) \
{ \
TYPE1 z0_res; \
INVOKE (CODE1, CODE2); \
@@ -136,7 +136,7 @@
}
#define TEST_DUAL_LANE_REG(NAME, ZTYPE1, ZTYPE2, REG, CODE1, CODE2) \
- PROTO (NAME, void, (void)) \
+ PROTO (NAME, void, (fpm_t fpm0)) \
{ \
register ZTYPE1 z0 __asm ("z0"); \
register ZTYPE2 z1 __asm ("z1"); \
@@ -194,7 +194,7 @@
PROTO (NAME, ZTYPE1, (ZTYPE1 z0, ZTYPE1 z1, ZTYPE1 z2, \
ZTYPE1 z3, ZTYPE2 z4, ZTYPE2 z5, \
ZTYPE2 z6, STYPE d7, svbool_t p0, \
- svbool_t p1)) \
+ svbool_t p1, fpm_t fpm0)) \
{ \
INVOKE (CODE1, CODE2); \
return z0; \
new file mode 100644
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+
+#include <arm_sve.h>
+
+#pragma GCC target ("arch=armv8.2-a+ssve-fp8fma")
+
+void
+f1 (svfloat16_t f16, svmfloat8_t f8, fpm_t fpm,
+ svbool_t pg, svuint8_t u8, svuint16_t u16, svint32_t s32,
+ svbfloat16_t bf16, svfloat32_t f32, svfloat64_t f64, mfloat8_t f, int i)
+ __arm_streaming
+{
+ svmlalb_lane_fpm (f16, f8, f8, 0, fpm);
+ svmlalb_lane_fpm (f16, f8, f8, 7, fpm);
+ svmlalb_lane_fpm (f16, f8, f8, 8, fpm);
+ svmlalb_lane_fpm (f16, f8, f8, 15, fpm);
+
+ svmlalb_lane_fpm (f16); /* { dg-error {too few arguments to function 'svmlalb_lane_fpm'} } */
+ svmlalb_lane_fpm (f16, f8); /* { dg-error {too few arguments to function 'svmlalb_lane_fpm'} } */
+ svmlalb_lane_fpm (f16, f8, f8); /* { dg-error {too few arguments to function 'svmlalb_lane_fpm'} } */
+ svmlalb_lane_fpm (f16, f8, f8, 0); /* { dg-error {too few arguments to function 'svmlalb_lane_fpm'} } */
+ svmlalb_lane_fpm (f16, f8, f8, fpm); /* { dg-error {too few arguments to function 'svmlalb_lane_fpm'} } */
+ svmlalb_lane_fpm (f16, f8, 15, fpm); /* { dg-error {too few arguments to function 'svmlalb_lane_fpm'} } */
+ svmlalb_lane_fpm (f8, f8, 15, fpm); /* { dg-error {too few arguments to function 'svmlalb_lane_fpm'} } */
+
+ svmlalb_lane_fpm (f16, f8, f8, 15, 0, fpm); /* { dg-error {too many arguments to function 'svmlalb_lane_fpm'} } */
+ svmlalb_lane_fpm (f16, f8, f8, 15, fpm, fpm); /* { dg-error {too many arguments to function 'svmlalb_lane_fpm'} } */
+ svmlalb_lane_fpm (f16, f8, f8, f8, 15, fpm); /* { dg-error {too many arguments to function 'svmlalb_lane_fpm'} } */
+ svmlalb_lane_fpm (f16, f16, f8, f8, 15, fpm); /* { dg-error {too many arguments to function 'svmlalb_lane_fpm'} } */
+
+ svmlalb_lane_fpm (f32, bf16, bf16, 0, fpm); /* { dg-error {passing 'svbfloat16_t' to argument 2 of 'svmlalb_lane_fpm', which expects 'svmfloat8_t'} } */
+ svmlalb_lane_fpm (0, f8, f8, 0, fpm); /* { dg-error {passing 'int' to argument 1 of 'svmlalb_lane_fpm', which expects an SVE type rather than a scalar} } */
+ svmlalb_lane_fpm (pg, f8, f8, 0, fpm); /* { dg-error {'svmlalb_lane_fpm' has no form that takes 'svbool_t' and 'svmfloat8_t' arguments} } */
+ svmlalb_lane_fpm (u8, f8, f8, 0, fpm); /* { dg-error {'svmlalb_lane_fpm' has no form that takes 'svuint8_t' and 'svmfloat8_t' arguments} } */
+ svmlalb_lane_fpm (u16, f8, f8, 0, fpm); /* { dg-error {'svmlalb_lane_fpm' has no form that takes 'svuint16_t' and 'svmfloat8_t' arguments} } */
+ svmlalb_lane_fpm (f32, f8, f8, 0, fpm); /* { dg-error {'svmlalb_lane_fpm' has no form that takes 'svfloat32_t' and 'svmfloat8_t' arguments} } */
+ svmlalb_lane_fpm (f64, f8, f8, 0, fpm); /* { dg-error {'svmlalb_lane_fpm' has no form that takes 'svfloat64_t' and 'svmfloat8_t' arguments} } */
+ svmlalb_lane_fpm (f16, 0, f8, 0, fpm); /* { dg-error {passing 'int' to argument 2 of 'svmlalb_lane_fpm', which expects 'svmfloat8_t'} } */
+ svmlalb_lane_fpm (f16, f32, f8, 0, fpm); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svmlalb_lane_fpm', which expects 'svmfloat8_t'} } */
+ svmlalb_lane_fpm (f16, f8, 0, 0, fpm); /* { dg-error {passing 'int' to argument 3 of 'svmlalb_lane_fpm', which expects 'svmfloat8_t'} } */
+ svmlalb_lane_fpm (f16, f8, f32, 0, fpm); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svmlalb_lane_fpm', which expects 'svmfloat8_t'} } */
+
+ svmlalb_lane_fpm (f16, f8, f8, s32, fpm); /* { dg-error {argument 4 of 'svmlalb_lane_fpm' must be an integer constant expression} } */
+ svmlalb_lane_fpm (f16, f8, f8, i, fpm); /* { dg-error {argument 4 of 'svmlalb_lane_fpm' must be an integer constant expression} } */
+ svmlalb_lane_fpm (f16, f8, f8, 16, fpm); /* { dg-error {passing 16 to argument 4 of 'svmlalb_lane_fpm', which expects a value in the range \[0, 15\]} } */
+ svmlalb_lane_fpm (f16, f8, f8, -1, fpm); /* { dg-error {passing -1 to argument 4 of 'svmlalb_lane_fpm', which expects a value in the range \[0, 15\]} } */
+ svmlalb_lane_fpm (f16, f8, f8, 15, f8); /* { dg-error {passing 'svmfloat8_t' to argument 5 of 'svmlalb_lane_fpm', which expects 'uint64_t'} } */
+
+
+ svmlallbb_lane_fpm (f32, f8, f8, 0, fpm);
+ svmlallbb_lane_fpm (f32, f8, f8, 7, fpm);
+ svmlallbb_lane_fpm (f32, f8, f8, 8, fpm);
+ svmlallbb_lane_fpm (f32, f8, f8, 15, fpm);
+
+ svmlallbb_lane_fpm (f32); /* { dg-error {too few arguments to function 'svmlallbb_lane_fpm'} } */
+ svmlallbb_lane_fpm (f32, f8); /* { dg-error {too few arguments to function 'svmlallbb_lane_fpm'} } */
+ svmlallbb_lane_fpm (f32, f8, f8); /* { dg-error {too few arguments to function 'svmlallbb_lane_fpm'} } */
+ svmlallbb_lane_fpm (f32, f8, f8, 0); /* { dg-error {too few arguments to function 'svmlallbb_lane_fpm'} } */
+ svmlallbb_lane_fpm (f32, f8, f8, fpm); /* { dg-error {too few arguments to function 'svmlallbb_lane_fpm'} } */
+ svmlallbb_lane_fpm (f32, f8, 15, fpm); /* { dg-error {too few arguments to function 'svmlallbb_lane_fpm'} } */
+ svmlallbb_lane_fpm (f8, f8, 15, fpm); /* { dg-error {too few arguments to function 'svmlallbb_lane_fpm'} } */
+
+ svmlallbb_lane_fpm (f32, f8, f8, 15, 0, fpm); /* { dg-error {too many arguments to function 'svmlallbb_lane_fpm'} } */
+ svmlallbb_lane_fpm (f32, f8, f8, 15, fpm, fpm); /* { dg-error {too many arguments to function 'svmlallbb_lane_fpm'} } */
+ svmlallbb_lane_fpm (f32, f8, f8, f8, 15, fpm); /* { dg-error {too many arguments to function 'svmlallbb_lane_fpm'} } */
+ svmlallbb_lane_fpm (f32, f16, f8, f8, 15, fpm); /* { dg-error {too many arguments to function 'svmlallbb_lane_fpm'} } */
+
+ svmlallbb_lane_fpm (f32, bf16, bf16, 0, fpm); /* { dg-error {passing 'svbfloat16_t' to argument 2 of 'svmlallbb_lane_fpm', which expects 'svmfloat8_t'} } */
+ svmlallbb_lane_fpm (0, f8, f8, 0, fpm); /* { dg-error {passing 'int' to argument 1 of 'svmlallbb_lane_fpm', which expects an SVE type rather than a scalar} } */
+ svmlallbb_lane_fpm (pg, f8, f8, 0, fpm); /* { dg-error {'svmlallbb_lane_fpm' has no form that takes 'svbool_t' and 'svmfloat8_t' arguments} } */
+ svmlallbb_lane_fpm (u8, f8, f8, 0, fpm); /* { dg-error {'svmlallbb_lane_fpm' has no form that takes 'svuint8_t' and 'svmfloat8_t' arguments} } */
+ svmlallbb_lane_fpm (u16, f8, f8, 0, fpm); /* { dg-error {'svmlallbb_lane_fpm' has no form that takes 'svuint16_t' and 'svmfloat8_t' arguments} } */
+ svmlallbb_lane_fpm (f16, f8, f8, 0, fpm); /* { dg-error {'svmlallbb_lane_fpm' has no form that takes 'svfloat16_t' and 'svmfloat8_t' arguments} } */
+ svmlallbb_lane_fpm (f64, f8, f8, 0, fpm); /* { dg-error {'svmlallbb_lane_fpm' has no form that takes 'svfloat64_t' and 'svmfloat8_t' arguments} } */
+ svmlallbb_lane_fpm (f32, 0, f8, 0, fpm); /* { dg-error {passing 'int' to argument 2 of 'svmlallbb_lane_fpm', which expects 'svmfloat8_t'} } */
+ svmlallbb_lane_fpm (f32, f32, f8, 0, fpm); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svmlallbb_lane_fpm', which expects 'svmfloat8_t'} } */
+ svmlallbb_lane_fpm (f32, f8, 0, 0, fpm); /* { dg-error {passing 'int' to argument 3 of 'svmlallbb_lane_fpm', which expects 'svmfloat8_t'} } */
+ svmlallbb_lane_fpm (f32, f8, f32, 0, fpm); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svmlallbb_lane_fpm', which expects 'svmfloat8_t'} } */
+
+ svmlallbb_lane_fpm (f32, f8, f8, s32, fpm); /* { dg-error {argument 4 of 'svmlallbb_lane_fpm' must be an integer constant expression} } */
+ svmlallbb_lane_fpm (f32, f8, f8, i, fpm); /* { dg-error {argument 4 of 'svmlallbb_lane_fpm' must be an integer constant expression} } */
+ svmlallbb_lane_fpm (f32, f8, f8, 16, fpm); /* { dg-error {passing 16 to argument 4 of 'svmlallbb_lane_fpm', which expects a value in the range \[0, 15\]} } */
+ svmlallbb_lane_fpm (f32, f8, f8, -1, fpm); /* { dg-error {passing -1 to argument 4 of 'svmlallbb_lane_fpm', which expects a value in the range \[0, 15\]} } */
+}
new file mode 100644
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+
+#include <arm_sve.h>
+
+#pragma GCC target ("arch=armv8.2-a+sve2+fp8fma")
+
+void
+test (svfloat16_t f16, svmfloat8_t f8, fpm_t fpm,
+ svbool_t pg, svuint8_t u8, svuint16_t u16, svint32_t s32,
+ svbfloat16_t bf16, svfloat32_t f32, svfloat64_t f64, mfloat8_t f)
+{
+ svmlalb_fpm (f16, f8, f8, fpm);
+ svmlalt_fpm (f16, f8, f8, fpm);
+ svmlalb_fpm (f16, f8, f, fpm);
+
+ svmlalb_fpm (f16); /* { dg-error {too few arguments to function 'svmlalb_fpm'} } */
+ svmlalb_fpm (f16, f8); /* { dg-error {too few arguments to function 'svmlalb_fpm'} } */
+ svmlalb_fpm (f16, f8, f8); /* { dg-error {too few arguments to function 'svmlalb_fpm'} } */
+ svmlalb_fpm (f8, f8, fpm); /* { dg-error {too few arguments to function 'svmlalb_fpm'} } */
+ svmlalb_fpm (f16, f8, fpm); /* { dg-error {too few arguments to function 'svmlalb_fpm'} } */
+ svmlalb_fpm (f16, f8, f8, fpm, 0); /* { dg-error {too many arguments to function 'svmlalb_fpm'} } */
+
+ svmlalt_fpm (f32, f8, f8, fpm); /* { dg-error {'svmlalt_fpm' has no form that takes 'svfloat32_t' and 'svmfloat8_t' arguments} } */
+ svmlalb_fpm (0, f8, f8, fpm); /* { dg-error {passing 'int' to argument 1 of 'svmlalb_fpm', which expects an SVE type rather than a scalar} } */
+ svmlalb_fpm (pg, f8, f8, fpm); /* { dg-error {'svmlalb_fpm' has no form that takes 'svbool_t' and 'svmfloat8_t' arguments} } */
+ svmlalb_fpm (u8, f8, f8, fpm); /* { dg-error {'svmlalb_fpm' has no form that takes 'svuint8_t' and 'svmfloat8_t' arguments} } */
+ svmlalb_fpm (u16, f8, f8, fpm); /* { dg-error {'svmlalb_fpm' has no form that takes 'svuint16_t' and 'svmfloat8_t' arguments} } */
+ svmlalb_fpm (f64, f8, f8, fpm); /* { dg-error {'svmlalb_fpm' has no form that takes 'svfloat64_t' and 'svmfloat8_t' arguments} } */
+ svmlalb_fpm (f16, 0, f8, fpm); /* { dg-error {passing 'int' to argument 2 of 'svmlalb_fpm', which expects 'svmfloat8_t'} } */
+ svmlalb_fpm (f16, f16, f8, fpm); /* { dg-error {passing 'svfloat16_t' to argument 2 of 'svmlalb_fpm', which expects 'svmfloat8_t'} } */
+ svmlalb_fpm (f16, f8, 0, fpm); /* { dg-error {invalid conversion to type 'mfloat8_t'} } */
+ svmlalb_fpm (f16, f8, f16, fpm); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svmlalb_fpm', which expects 'svmfloat8_t'} } */
+ svmlalb_fpm (f16, f8, f8, f8); /* { dg-error {passing 'svmfloat8_t' to argument 4 of 'svmlalb_fpm', which expects 'uint64_t'} } */
+
+
+ svmlallbb_fpm (f32, f8, f8, fpm);
+ svmlallbt_fpm (f32, f8, f8, fpm);
+ svmlalltb_fpm (f32, f8, f8, fpm);
+ svmlalltt_fpm (f32, f8, f8, fpm);
+ svmlallbb_fpm (f32, f8, f, fpm);
+
+ svmlallbb_fpm (f16, f8, f8, fpm); /* { dg-error {'svmlallbb_fpm' has no form that takes 'svfloat16_t' and 'svmfloat8_t' arguments} } */
+ svmlallbb_fpm (f32); /* { dg-error {too few arguments to function 'svmlallbb_fpm'} } */
+ svmlallbb_fpm (f32, f8); /* { dg-error {too few arguments to function 'svmlallbb_fpm'} } */
+ svmlallbb_fpm (f32, f8, f8); /* { dg-error {too few arguments to function 'svmlallbb_fpm'} } */
+ svmlallbb_fpm (f8, f8, fpm); /* { dg-error {too few arguments to function 'svmlallbb_fpm'} } */
+ svmlallbb_fpm (f32, f8, fpm); /* { dg-error {too few arguments to function 'svmlallbb_fpm'} } */
+ svmlallbb_fpm (f32, f8, f8, fpm, 0); /* { dg-error {too many arguments to function 'svmlallbb_fpm'} } */
+ svmlallbb_fpm (0, f8, f8, fpm); /* { dg-error {passing 'int' to argument 1 of 'svmlallbb_fpm', which expects an SVE type rather than a scalar} } */
+ svmlallbb_fpm (pg, f8, f8, fpm); /* { dg-error {'svmlallbb_fpm' has no form that takes 'svbool_t' and 'svmfloat8_t' arguments} } */
+ svmlallbb_fpm (u8, f8, f8, fpm); /* { dg-error {'svmlallbb_fpm' has no form that takes 'svuint8_t' and 'svmfloat8_t' arguments} } */
+ svmlallbb_fpm (u16, f8, f8, fpm); /* { dg-error {'svmlallbb_fpm' has no form that takes 'svuint16_t' and 'svmfloat8_t' arguments} } */
+ svmlallbb_fpm (f64, f8, f8, fpm); /* { dg-error {'svmlallbb_fpm' has no form that takes 'svfloat64_t' and 'svmfloat8_t' arguments} } */
+ svmlallbb_fpm (f32, 0, f8, fpm); /* { dg-error {passing 'int' to argument 2 of 'svmlallbb_fpm', which expects 'svmfloat8_t'} } */
+ svmlallbb_fpm (f32, f16, f8, fpm); /* { dg-error {passing 'svfloat16_t' to argument 2 of 'svmlallbb_fpm', which expects 'svmfloat8_t'} } */
+ svmlallbb_fpm (f32, f8, 0, fpm); /* { dg-error {invalid conversion to type 'mfloat8_t'} } */
+ svmlallbb_fpm (f32, f8, f16, fpm); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svmlallbb_fpm', which expects 'svmfloat8_t'} } */
+ svmlallbb_fpm (f32, f8, f8, f8); /* { dg-error {passing 'svmfloat8_t' to argument 4 of 'svmlallbb_fpm', which expects 'uint64_t'} } */
+}
new file mode 100644
@@ -0,0 +1,91 @@
+/* { dg-do assemble { target aarch64_asm_fp8fma_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_fp8fma_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+fp8fma"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+ssve-fp8fma"
+#endif
+
+/*
+** mlalb_lane_0_f16_tied1:
+** msr fpmr, x0
+** fmlalb z0\.h, z4\.b, z5\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z (mlalb_lane_0_f16_tied1, svfloat16_t, svmfloat8_t,
+ z0 = svmlalb_lane_f16_mf8_fpm (z0, z4, z5, 0, fpm0),
+ z0 = svmlalb_lane_fpm (z0, z4, z5, 0, fpm0))
+
+/*
+** mlalb_lane_0_f16_tied2:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlalb z0\.h, \1\.b, z1\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z_REV (mlalb_lane_0_f16_tied2, svfloat16_t, svmfloat8_t,
+ z0_res = svmlalb_lane_f16_mf8_fpm (z4, z0, z1, 0, fpm0),
+ z0_res = svmlalb_lane_fpm (z4, z0, z1, 0, fpm0))
+
+/*
+** mlalb_lane_0_f16_tied3:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlalb z0\.h, z1\.b, \1\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z_REV (mlalb_lane_0_f16_tied3, svfloat16_t, svmfloat8_t,
+ z0_res = svmlalb_lane_f16_mf8_fpm (z4, z1, z0, 0, fpm0),
+ z0_res = svmlalb_lane_fpm (z4, z1, z0, 0, fpm0))
+
+/*
+** mlalb_lane_0_f16_untied:
+** msr fpmr, x0
+** movprfx z0, z1
+** fmlalb z0\.h, z4\.b, z5\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z (mlalb_lane_0_f16_untied, svfloat16_t, svmfloat8_t,
+ z0 = svmlalb_lane_f16_mf8_fpm (z1, z4, z5, 0, fpm0),
+ z0 = svmlalb_lane_fpm (z1, z4, z5, 0, fpm0))
+
+/*
+** mlalb_lane_1_f16:
+** msr fpmr, x0
+** fmlalb z0\.h, z4\.b, z5\.b\[1\]
+** ret
+*/
+TEST_DUAL_Z (mlalb_lane_1_f16, svfloat16_t, svmfloat8_t,
+ z0 = svmlalb_lane_f16_mf8_fpm (z0, z4, z5, 1, fpm0),
+ z0 = svmlalb_lane_fpm (z0, z4, z5, 1, fpm0))
+
+/*
+** mlalb_lane_z8_f16:
+** ...
+** msr fpmr, x0
+** mov (z[0-7])\.d, z8\.d
+** fmlalb z0\.h, z1\.b, \1\.b\[1\]
+** ldr d8, \[sp\], 32
+** ret
+*/
+TEST_DUAL_LANE_REG (mlalb_lane_z8_f16, svfloat16_t, svmfloat8_t, z8,
+ z0 = svmlalb_lane_f16_mf8_fpm (z0, z1, z8, 1, fpm0),
+ z0 = svmlalb_lane_fpm (z0, z1, z8, 1, fpm0))
+
+/*
+** mlalb_lane_z16_f16:
+** ...
+** msr fpmr, x0
+** mov (z[0-7])\.d, z16\.d
+** fmlalb z0\.h, z1\.b, \1\.b\[15\]
+** ...
+** ret
+*/
+TEST_DUAL_LANE_REG (mlalb_lane_z16_f16, svfloat16_t, svmfloat8_t, z16,
+ z0 = svmlalb_lane_f16_mf8_fpm (z0, z1, z16, 15, fpm0),
+ z0 = svmlalb_lane_fpm (z0, z1, z16, 15, fpm0))
new file mode 100644
@@ -0,0 +1,78 @@
+/* { dg-do assemble { target aarch64_asm_fp8fma_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_fp8fma_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+fp8fma"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+ssve-fp8fma"
+#endif
+
+/*
+** mlalb_f16_mf8_tied1:
+** msr fpmr, x0
+** fmlalb z0\.h, z4\.b, z5\.b
+** ret
+*/
+TEST_DUAL_Z (mlalb_f16_mf8_tied1, svfloat16_t, svmfloat8_t,
+ z0 = svmlalb_f16_mf8_fpm (z0, z4, z5, fpm0),
+ z0 = svmlalb_fpm (z0, z4, z5, fpm0))
+
+/*
+** mlalb_f16_mf8_tied2:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlalb z0\.h, \1\.b, z1\.b
+** ret
+*/
+TEST_DUAL_Z_REV (mlalb_f16_mf8_tied2, svfloat16_t, svmfloat8_t,
+ z0_res = svmlalb_f16_mf8_fpm (z4, z0, z1, fpm0),
+ z0_res = svmlalb_fpm (z4, z0, z1, fpm0))
+
+/*
+** mlalb_f16_mf8_tied3:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlalb z0\.h, z1\.b, \1\.b
+** ret
+*/
+TEST_DUAL_Z_REV (mlalb_f16_mf8_tied3, svfloat16_t, svmfloat8_t,
+ z0_res = svmlalb_f16_mf8_fpm (z4, z1, z0, fpm0),
+ z0_res = svmlalb_fpm (z4, z1, z0, fpm0))
+
+/*
+** mlalb_f16_mf8_untied:
+** msr fpmr, x0
+** movprfx z0, z1
+** fmlalb z0\.h, z4\.b, z5\.b
+** ret
+*/
+TEST_DUAL_Z (mlalb_f16_mf8_untied, svfloat16_t, svmfloat8_t,
+ z0 = svmlalb_f16_mf8_fpm (z1, z4, z5, fpm0),
+ z0 = svmlalb_fpm (z1, z4, z5, fpm0))
+
+/*
+** mlalb_h7_f16_tied1:
+** msr fpmr, x0
+** mov (z[0-9]+\.b), b7
+** fmlalb z0\.h, z4\.b, \1
+** ret
+*/
+TEST_DUAL_ZD (mlalb_h7_f16_tied1, svfloat16_t, svmfloat8_t, mfloat8_t,
+ z0 = svmlalb_n_f16_mf8_fpm (z0, z4, d7, fpm0),
+ z0 = svmlalb_fpm (z0, z4, d7, fpm0))
+
+/*
+** mlalb_h7_f16_untied:
+** msr fpmr, x0
+** mov (z[0-9]+\.b), b7
+** movprfx z0, z1
+** fmlalb z0\.h, z4\.b, \1
+** ret
+*/
+TEST_DUAL_ZD (mlalb_h7_f16_untied, svfloat16_t, svmfloat8_t, mfloat8_t,
+ z0 = svmlalb_n_f16_mf8_fpm (z1, z4, d7, fpm0),
+ z0 = svmlalb_fpm (z1, z4, d7, fpm0))
new file mode 100644
@@ -0,0 +1,91 @@
+/* { dg-do assemble { target aarch64_asm_fp8fma_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_fp8fma_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+fp8fma"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+ssve-fp8fma"
+#endif
+
+/*
+** mlallbb_lane_0_f32_tied1:
+** msr fpmr, x0
+** fmlallbb z0\.s, z4\.b, z5\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z (mlallbb_lane_0_f32_tied1, svfloat32_t, svmfloat8_t,
+ z0 = svmlallbb_lane_f32_mf8_fpm (z0, z4, z5, 0, fpm0),
+ z0 = svmlallbb_lane_fpm (z0, z4, z5, 0, fpm0))
+
+/*
+** mlallbb_lane_0_f32_tied2:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlallbb z0\.s, \1\.b, z1\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z_REV (mlallbb_lane_0_f32_tied2, svfloat32_t, svmfloat8_t,
+ z0_res = svmlallbb_lane_f32_mf8_fpm (z4, z0, z1, 0, fpm0),
+ z0_res = svmlallbb_lane_fpm (z4, z0, z1, 0, fpm0))
+
+/*
+** mlallbb_lane_0_f32_tied3:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlallbb z0\.s, z1\.b, \1\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z_REV (mlallbb_lane_0_f32_tied3, svfloat32_t, svmfloat8_t,
+ z0_res = svmlallbb_lane_f32_mf8_fpm (z4, z1, z0, 0, fpm0),
+ z0_res = svmlallbb_lane_fpm (z4, z1, z0, 0, fpm0))
+
+/*
+** mlallbb_lane_0_f32_untied:
+** msr fpmr, x0
+** movprfx z0, z1
+** fmlallbb z0\.s, z4\.b, z5\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z (mlallbb_lane_0_f32_untied, svfloat32_t, svmfloat8_t,
+ z0 = svmlallbb_lane_f32_mf8_fpm (z1, z4, z5, 0, fpm0),
+ z0 = svmlallbb_lane_fpm (z1, z4, z5, 0, fpm0))
+
+/*
+** mlallbb_lane_1_f32:
+** msr fpmr, x0
+** fmlallbb z0\.s, z4\.b, z5\.b\[1\]
+** ret
+*/
+TEST_DUAL_Z (mlallbb_lane_1_f32, svfloat32_t, svmfloat8_t,
+ z0 = svmlallbb_lane_f32_mf8_fpm (z0, z4, z5, 1, fpm0),
+ z0 = svmlallbb_lane_fpm (z0, z4, z5, 1, fpm0))
+
+/*
+** mlallbb_lane_z8_f32:
+** ...
+** msr fpmr, x0
+** mov (z[0-7])\.d, z8\.d
+** fmlallbb z0\.s, z1\.b, \1\.b\[1\]
+** ldr d8, \[sp\], 32
+** ret
+*/
+TEST_DUAL_LANE_REG (mlallbb_lane_z8_f32, svfloat32_t, svmfloat8_t, z8,
+ z0 = svmlallbb_lane_f32_mf8_fpm (z0, z1, z8, 1, fpm0),
+ z0 = svmlallbb_lane_fpm (z0, z1, z8, 1, fpm0))
+
+/*
+** mlallbb_lane_z16_f32:
+** ...
+** msr fpmr, x0
+** mov (z[0-7])\.d, z16\.d
+** fmlallbb z0\.s, z1\.b, \1\.b\[15\]
+** ...
+** ret
+*/
+TEST_DUAL_LANE_REG (mlallbb_lane_z16_f32, svfloat32_t, svmfloat8_t, z16,
+ z0 = svmlallbb_lane_f32_mf8_fpm (z0, z1, z16, 15, fpm0),
+ z0 = svmlallbb_lane_fpm (z0, z1, z16, 15, fpm0))
new file mode 100644
@@ -0,0 +1,78 @@
+/* { dg-do assemble { target aarch64_asm_fp8fma_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_fp8fma_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+fp8fma"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+ssve-fp8fma"
+#endif
+
+/*
+** mlallbb_f32_mf8_tied1:
+** msr fpmr, x0
+** fmlallbb z0\.s, z4\.b, z5\.b
+** ret
+*/
+TEST_DUAL_Z (mlallbb_f32_mf8_tied1, svfloat32_t, svmfloat8_t,
+ z0 = svmlallbb_f32_mf8_fpm (z0, z4, z5, fpm0),
+ z0 = svmlallbb_fpm (z0, z4, z5, fpm0))
+
+/*
+** mlallbb_f32_mf8_tied2:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlallbb z0\.s, \1\.b, z1\.b
+** ret
+*/
+TEST_DUAL_Z_REV (mlallbb_f32_mf8_tied2, svfloat32_t, svmfloat8_t,
+ z0_res = svmlallbb_f32_mf8_fpm (z4, z0, z1, fpm0),
+ z0_res = svmlallbb_fpm (z4, z0, z1, fpm0))
+
+/*
+** mlallbb_f32_mf8_tied3:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlallbb z0\.s, z1\.b, \1\.b
+** ret
+*/
+TEST_DUAL_Z_REV (mlallbb_f32_mf8_tied3, svfloat32_t, svmfloat8_t,
+ z0_res = svmlallbb_f32_mf8_fpm (z4, z1, z0, fpm0),
+ z0_res = svmlallbb_fpm (z4, z1, z0, fpm0))
+
+/*
+** mlallbb_f32_mf8_untied:
+** msr fpmr, x0
+** movprfx z0, z1
+** fmlallbb z0\.s, z4\.b, z5\.b
+** ret
+*/
+TEST_DUAL_Z (mlallbb_f32_mf8_untied, svfloat32_t, svmfloat8_t,
+ z0 = svmlallbb_f32_mf8_fpm (z1, z4, z5, fpm0),
+ z0 = svmlallbb_fpm (z1, z4, z5, fpm0))
+
+/*
+** mlallbb_h7_f32_tied1:
+** msr fpmr, x0
+** mov (z[0-9]+\.b), b7
+** fmlallbb z0\.s, z4\.b, \1
+** ret
+*/
+TEST_DUAL_ZD (mlallbb_h7_f32_tied1, svfloat32_t, svmfloat8_t, mfloat8_t,
+ z0 = svmlallbb_n_f32_mf8_fpm (z0, z4, d7, fpm0),
+ z0 = svmlallbb_fpm (z0, z4, d7, fpm0))
+
+/*
+** mlallbb_h7_f32_untied:
+** msr fpmr, x0
+** mov (z[0-9]+\.b), b7
+** movprfx z0, z1
+** fmlallbb z0\.s, z4\.b, \1
+** ret
+*/
+TEST_DUAL_ZD (mlallbb_h7_f32_untied, svfloat32_t, svmfloat8_t, mfloat8_t,
+ z0 = svmlallbb_n_f32_mf8_fpm (z1, z4, d7, fpm0),
+ z0 = svmlallbb_fpm (z1, z4, d7, fpm0))
new file mode 100644
@@ -0,0 +1,91 @@
+/* { dg-do assemble { target aarch64_asm_fp8fma_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_fp8fma_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+fp8fma"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+ssve-fp8fma"
+#endif
+
+/*
+** mlallbt_lane_0_f32_tied1:
+** msr fpmr, x0
+** fmlallbt z0\.s, z4\.b, z5\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z (mlallbt_lane_0_f32_tied1, svfloat32_t, svmfloat8_t,
+ z0 = svmlallbt_lane_f32_mf8_fpm (z0, z4, z5, 0, fpm0),
+ z0 = svmlallbt_lane_fpm (z0, z4, z5, 0, fpm0))
+
+/*
+** mlallbt_lane_0_f32_tied2:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlallbt z0\.s, \1\.b, z1\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z_REV (mlallbt_lane_0_f32_tied2, svfloat32_t, svmfloat8_t,
+ z0_res = svmlallbt_lane_f32_mf8_fpm (z4, z0, z1, 0, fpm0),
+ z0_res = svmlallbt_lane_fpm (z4, z0, z1, 0, fpm0))
+
+/*
+** mlallbt_lane_0_f32_tied3:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlallbt z0\.s, z1\.b, \1\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z_REV (mlallbt_lane_0_f32_tied3, svfloat32_t, svmfloat8_t,
+ z0_res = svmlallbt_lane_f32_mf8_fpm (z4, z1, z0, 0, fpm0),
+ z0_res = svmlallbt_lane_fpm (z4, z1, z0, 0, fpm0))
+
+/*
+** mlallbt_lane_0_f32_untied:
+** msr fpmr, x0
+** movprfx z0, z1
+** fmlallbt z0\.s, z4\.b, z5\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z (mlallbt_lane_0_f32_untied, svfloat32_t, svmfloat8_t,
+ z0 = svmlallbt_lane_f32_mf8_fpm (z1, z4, z5, 0, fpm0),
+ z0 = svmlallbt_lane_fpm (z1, z4, z5, 0, fpm0))
+
+/*
+** mlallbt_lane_1_f32:
+** msr fpmr, x0
+** fmlallbt z0\.s, z4\.b, z5\.b\[1\]
+** ret
+*/
+TEST_DUAL_Z (mlallbt_lane_1_f32, svfloat32_t, svmfloat8_t,
+ z0 = svmlallbt_lane_f32_mf8_fpm (z0, z4, z5, 1, fpm0),
+ z0 = svmlallbt_lane_fpm (z0, z4, z5, 1, fpm0))
+
+/*
+** mlallbt_lane_z8_f32:
+** ...
+** msr fpmr, x0
+** mov (z[0-7])\.d, z8\.d
+** fmlallbt z0\.s, z1\.b, \1\.b\[1\]
+** ldr d8, \[sp\], 32
+** ret
+*/
+TEST_DUAL_LANE_REG (mlallbt_lane_z8_f32, svfloat32_t, svmfloat8_t, z8,
+ z0 = svmlallbt_lane_f32_mf8_fpm (z0, z1, z8, 1, fpm0),
+ z0 = svmlallbt_lane_fpm (z0, z1, z8, 1, fpm0))
+
+/*
+** mlallbt_lane_z16_f32:
+** ...
+** msr fpmr, x0
+** mov (z[0-7])\.d, z16\.d
+** fmlallbt z0\.s, z1\.b, \1\.b\[15\]
+** ...
+** ret
+*/
+TEST_DUAL_LANE_REG (mlallbt_lane_z16_f32, svfloat32_t, svmfloat8_t, z16,
+ z0 = svmlallbt_lane_f32_mf8_fpm (z0, z1, z16, 15, fpm0),
+ z0 = svmlallbt_lane_fpm (z0, z1, z16, 15, fpm0))
new file mode 100644
@@ -0,0 +1,78 @@
+/* { dg-do assemble { target aarch64_asm_fp8fma_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_fp8fma_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+fp8fma"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+ssve-fp8fma"
+#endif
+
+/*
+** mlallbt_f32_mf8_tied1:
+** msr fpmr, x0
+** fmlallbt z0\.s, z4\.b, z5\.b
+** ret
+*/
+TEST_DUAL_Z (mlallbt_f32_mf8_tied1, svfloat32_t, svmfloat8_t,
+ z0 = svmlallbt_f32_mf8_fpm (z0, z4, z5, fpm0),
+ z0 = svmlallbt_fpm (z0, z4, z5, fpm0))
+
+/*
+** mlallbt_f32_mf8_tied2:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlallbt z0\.s, \1\.b, z1\.b
+** ret
+*/
+TEST_DUAL_Z_REV (mlallbt_f32_mf8_tied2, svfloat32_t, svmfloat8_t,
+ z0_res = svmlallbt_f32_mf8_fpm (z4, z0, z1, fpm0),
+ z0_res = svmlallbt_fpm (z4, z0, z1, fpm0))
+
+/*
+** mlallbt_f32_mf8_tied3:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlallbt z0\.s, z1\.b, \1\.b
+** ret
+*/
+TEST_DUAL_Z_REV (mlallbt_f32_mf8_tied3, svfloat32_t, svmfloat8_t,
+ z0_res = svmlallbt_f32_mf8_fpm (z4, z1, z0, fpm0),
+ z0_res = svmlallbt_fpm (z4, z1, z0, fpm0))
+
+/*
+** mlallbt_f32_mf8_untied:
+** msr fpmr, x0
+** movprfx z0, z1
+** fmlallbt z0\.s, z4\.b, z5\.b
+** ret
+*/
+TEST_DUAL_Z (mlallbt_f32_mf8_untied, svfloat32_t, svmfloat8_t,
+ z0 = svmlallbt_f32_mf8_fpm (z1, z4, z5, fpm0),
+ z0 = svmlallbt_fpm (z1, z4, z5, fpm0))
+
+/*
+** mlallbt_h7_f32_tied1:
+** msr fpmr, x0
+** mov (z[0-9]+\.b), b7
+** fmlallbt z0\.s, z4\.b, \1
+** ret
+*/
+TEST_DUAL_ZD (mlallbt_h7_f32_tied1, svfloat32_t, svmfloat8_t, mfloat8_t,
+ z0 = svmlallbt_n_f32_mf8_fpm (z0, z4, d7, fpm0),
+ z0 = svmlallbt_fpm (z0, z4, d7, fpm0))
+
+/*
+** mlallbt_h7_f32_untied:
+** msr fpmr, x0
+** mov (z[0-9]+\.b), b7
+** movprfx z0, z1
+** fmlallbt z0\.s, z4\.b, \1
+** ret
+*/
+TEST_DUAL_ZD (mlallbt_h7_f32_untied, svfloat32_t, svmfloat8_t, mfloat8_t,
+ z0 = svmlallbt_n_f32_mf8_fpm (z1, z4, d7, fpm0),
+ z0 = svmlallbt_fpm (z1, z4, d7, fpm0))
new file mode 100644
@@ -0,0 +1,91 @@
+/* { dg-do assemble { target aarch64_asm_fp8fma_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_fp8fma_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+fp8fma"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+ssve-fp8fma"
+#endif
+
+/*
+** mlalltb_lane_0_f32_tied1:
+** msr fpmr, x0
+** fmlalltb z0\.s, z4\.b, z5\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z (mlalltb_lane_0_f32_tied1, svfloat32_t, svmfloat8_t,
+ z0 = svmlalltb_lane_f32_mf8_fpm (z0, z4, z5, 0, fpm0),
+ z0 = svmlalltb_lane_fpm (z0, z4, z5, 0, fpm0))
+
+/*
+** mlalltb_lane_0_f32_tied2:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlalltb z0\.s, \1\.b, z1\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z_REV (mlalltb_lane_0_f32_tied2, svfloat32_t, svmfloat8_t,
+ z0_res = svmlalltb_lane_f32_mf8_fpm (z4, z0, z1, 0, fpm0),
+ z0_res = svmlalltb_lane_fpm (z4, z0, z1, 0, fpm0))
+
+/*
+** mlalltb_lane_0_f32_tied3:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlalltb z0\.s, z1\.b, \1\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z_REV (mlalltb_lane_0_f32_tied3, svfloat32_t, svmfloat8_t,
+ z0_res = svmlalltb_lane_f32_mf8_fpm (z4, z1, z0, 0, fpm0),
+ z0_res = svmlalltb_lane_fpm (z4, z1, z0, 0, fpm0))
+
+/*
+** mlalltb_lane_0_f32_untied:
+** msr fpmr, x0
+** movprfx z0, z1
+** fmlalltb z0\.s, z4\.b, z5\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z (mlalltb_lane_0_f32_untied, svfloat32_t, svmfloat8_t,
+ z0 = svmlalltb_lane_f32_mf8_fpm (z1, z4, z5, 0, fpm0),
+ z0 = svmlalltb_lane_fpm (z1, z4, z5, 0, fpm0))
+
+/*
+** mlalltb_lane_1_f32:
+** msr fpmr, x0
+** fmlalltb z0\.s, z4\.b, z5\.b\[1\]
+** ret
+*/
+TEST_DUAL_Z (mlalltb_lane_1_f32, svfloat32_t, svmfloat8_t,
+ z0 = svmlalltb_lane_f32_mf8_fpm (z0, z4, z5, 1, fpm0),
+ z0 = svmlalltb_lane_fpm (z0, z4, z5, 1, fpm0))
+
+/*
+** mlalltb_lane_z8_f32:
+** ...
+** msr fpmr, x0
+** mov (z[0-7])\.d, z8\.d
+** fmlalltb z0\.s, z1\.b, \1\.b\[1\]
+** ldr d8, \[sp\], 32
+** ret
+*/
+TEST_DUAL_LANE_REG (mlalltb_lane_z8_f32, svfloat32_t, svmfloat8_t, z8,
+ z0 = svmlalltb_lane_f32_mf8_fpm (z0, z1, z8, 1, fpm0),
+ z0 = svmlalltb_lane_fpm (z0, z1, z8, 1, fpm0))
+
+/*
+** mlalltb_lane_z16_f32:
+** ...
+** msr fpmr, x0
+** mov (z[0-7])\.d, z16\.d
+** fmlalltb z0\.s, z1\.b, \1\.b\[15\]
+** ...
+** ret
+*/
+TEST_DUAL_LANE_REG (mlalltb_lane_z16_f32, svfloat32_t, svmfloat8_t, z16,
+ z0 = svmlalltb_lane_f32_mf8_fpm (z0, z1, z16, 15, fpm0),
+ z0 = svmlalltb_lane_fpm (z0, z1, z16, 15, fpm0))
new file mode 100644
@@ -0,0 +1,78 @@
+/* { dg-do assemble { target aarch64_asm_fp8fma_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_fp8fma_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+fp8fma"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+ssve-fp8fma"
+#endif
+
+/*
+** mlalltb_f32_mf8_tied1:
+** msr fpmr, x0
+** fmlalltb z0\.s, z4\.b, z5\.b
+** ret
+*/
+TEST_DUAL_Z (mlalltb_f32_mf8_tied1, svfloat32_t, svmfloat8_t,
+ z0 = svmlalltb_f32_mf8_fpm (z0, z4, z5, fpm0),
+ z0 = svmlalltb_fpm (z0, z4, z5, fpm0))
+
+/*
+** mlalltb_f32_mf8_tied2:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlalltb z0\.s, \1\.b, z1\.b
+** ret
+*/
+TEST_DUAL_Z_REV (mlalltb_f32_mf8_tied2, svfloat32_t, svmfloat8_t,
+ z0_res = svmlalltb_f32_mf8_fpm (z4, z0, z1, fpm0),
+ z0_res = svmlalltb_fpm (z4, z0, z1, fpm0))
+
+/*
+** mlalltb_f32_mf8_tied3:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlalltb z0\.s, z1\.b, \1\.b
+** ret
+*/
+TEST_DUAL_Z_REV (mlalltb_f32_mf8_tied3, svfloat32_t, svmfloat8_t,
+ z0_res = svmlalltb_f32_mf8_fpm (z4, z1, z0, fpm0),
+ z0_res = svmlalltb_fpm (z4, z1, z0, fpm0))
+
+/*
+** mlalltb_f32_mf8_untied:
+** msr fpmr, x0
+** movprfx z0, z1
+** fmlalltb z0\.s, z4\.b, z5\.b
+** ret
+*/
+TEST_DUAL_Z (mlalltb_f32_mf8_untied, svfloat32_t, svmfloat8_t,
+ z0 = svmlalltb_f32_mf8_fpm (z1, z4, z5, fpm0),
+ z0 = svmlalltb_fpm (z1, z4, z5, fpm0))
+
+/*
+** mlalltb_h7_f32_tied1:
+** msr fpmr, x0
+** mov (z[0-9]+\.b), b7
+** fmlalltb z0\.s, z4\.b, \1
+** ret
+*/
+TEST_DUAL_ZD (mlalltb_h7_f32_tied1, svfloat32_t, svmfloat8_t, mfloat8_t,
+ z0 = svmlalltb_n_f32_mf8_fpm (z0, z4, d7, fpm0),
+ z0 = svmlalltb_fpm (z0, z4, d7, fpm0))
+
+/*
+** mlalltb_h7_f32_untied:
+** msr fpmr, x0
+** mov (z[0-9]+\.b), b7
+** movprfx z0, z1
+** fmlalltb z0\.s, z4\.b, \1
+** ret
+*/
+TEST_DUAL_ZD (mlalltb_h7_f32_untied, svfloat32_t, svmfloat8_t, mfloat8_t,
+ z0 = svmlalltb_n_f32_mf8_fpm (z1, z4, d7, fpm0),
+ z0 = svmlalltb_fpm (z1, z4, d7, fpm0))
new file mode 100644
@@ -0,0 +1,91 @@
+/* { dg-do assemble { target aarch64_asm_fp8fma_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_fp8fma_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+fp8fma"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+ssve-fp8fma"
+#endif
+
+/*
+** mlalltt_lane_0_f32_tied1:
+** msr fpmr, x0
+** fmlalltt z0\.s, z4\.b, z5\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z (mlalltt_lane_0_f32_tied1, svfloat32_t, svmfloat8_t,
+ z0 = svmlalltt_lane_f32_mf8_fpm (z0, z4, z5, 0, fpm0),
+ z0 = svmlalltt_lane_fpm (z0, z4, z5, 0, fpm0))
+
+/*
+** mlalltt_lane_0_f32_tied2:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlalltt z0\.s, \1\.b, z1\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z_REV (mlalltt_lane_0_f32_tied2, svfloat32_t, svmfloat8_t,
+ z0_res = svmlalltt_lane_f32_mf8_fpm (z4, z0, z1, 0, fpm0),
+ z0_res = svmlalltt_lane_fpm (z4, z0, z1, 0, fpm0))
+
+/*
+** mlalltt_lane_0_f32_tied3:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlalltt z0\.s, z1\.b, \1\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z_REV (mlalltt_lane_0_f32_tied3, svfloat32_t, svmfloat8_t,
+ z0_res = svmlalltt_lane_f32_mf8_fpm (z4, z1, z0, 0, fpm0),
+ z0_res = svmlalltt_lane_fpm (z4, z1, z0, 0, fpm0))
+
+/*
+** mlalltt_lane_0_f32_untied:
+** msr fpmr, x0
+** movprfx z0, z1
+** fmlalltt z0\.s, z4\.b, z5\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z (mlalltt_lane_0_f32_untied, svfloat32_t, svmfloat8_t,
+ z0 = svmlalltt_lane_f32_mf8_fpm (z1, z4, z5, 0, fpm0),
+ z0 = svmlalltt_lane_fpm (z1, z4, z5, 0, fpm0))
+
+/*
+** mlalltt_lane_1_f32:
+** msr fpmr, x0
+** fmlalltt z0\.s, z4\.b, z5\.b\[1\]
+** ret
+*/
+TEST_DUAL_Z (mlalltt_lane_1_f32, svfloat32_t, svmfloat8_t,
+ z0 = svmlalltt_lane_f32_mf8_fpm (z0, z4, z5, 1, fpm0),
+ z0 = svmlalltt_lane_fpm (z0, z4, z5, 1, fpm0))
+
+/*
+** mlalltt_lane_z8_f32:
+** ...
+** msr fpmr, x0
+** mov (z[0-7])\.d, z8\.d
+** fmlalltt z0\.s, z1\.b, \1\.b\[1\]
+** ldr d8, \[sp\], 32
+** ret
+*/
+TEST_DUAL_LANE_REG (mlalltt_lane_z8_f32, svfloat32_t, svmfloat8_t, z8,
+ z0 = svmlalltt_lane_f32_mf8_fpm (z0, z1, z8, 1, fpm0),
+ z0 = svmlalltt_lane_fpm (z0, z1, z8, 1, fpm0))
+
+/*
+** mlalltt_lane_z16_f32:
+** ...
+** msr fpmr, x0
+** mov (z[0-7])\.d, z16\.d
+** fmlalltt z0\.s, z1\.b, \1\.b\[15\]
+** ...
+** ret
+*/
+TEST_DUAL_LANE_REG (mlalltt_lane_z16_f32, svfloat32_t, svmfloat8_t, z16,
+ z0 = svmlalltt_lane_f32_mf8_fpm (z0, z1, z16, 15, fpm0),
+ z0 = svmlalltt_lane_fpm (z0, z1, z16, 15, fpm0))
new file mode 100644
@@ -0,0 +1,78 @@
+/* { dg-do assemble { target aarch64_asm_fp8fma_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_fp8fma_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+fp8fma"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+ssve-fp8fma"
+#endif
+
+/*
+** mlalltt_f32_mf8_tied1:
+** msr fpmr, x0
+** fmlalltt z0\.s, z4\.b, z5\.b
+** ret
+*/
+TEST_DUAL_Z (mlalltt_f32_mf8_tied1, svfloat32_t, svmfloat8_t,
+ z0 = svmlalltt_f32_mf8_fpm (z0, z4, z5, fpm0),
+ z0 = svmlalltt_fpm (z0, z4, z5, fpm0))
+
+/*
+** mlalltt_f32_mf8_tied2:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlalltt z0\.s, \1\.b, z1\.b
+** ret
+*/
+TEST_DUAL_Z_REV (mlalltt_f32_mf8_tied2, svfloat32_t, svmfloat8_t,
+ z0_res = svmlalltt_f32_mf8_fpm (z4, z0, z1, fpm0),
+ z0_res = svmlalltt_fpm (z4, z0, z1, fpm0))
+
+/*
+** mlalltt_f32_mf8_tied3:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlalltt z0\.s, z1\.b, \1\.b
+** ret
+*/
+TEST_DUAL_Z_REV (mlalltt_f32_mf8_tied3, svfloat32_t, svmfloat8_t,
+ z0_res = svmlalltt_f32_mf8_fpm (z4, z1, z0, fpm0),
+ z0_res = svmlalltt_fpm (z4, z1, z0, fpm0))
+
+/*
+** mlalltt_f32_mf8_untied:
+** msr fpmr, x0
+** movprfx z0, z1
+** fmlalltt z0\.s, z4\.b, z5\.b
+** ret
+*/
+TEST_DUAL_Z (mlalltt_f32_mf8_untied, svfloat32_t, svmfloat8_t,
+ z0 = svmlalltt_f32_mf8_fpm (z1, z4, z5, fpm0),
+ z0 = svmlalltt_fpm (z1, z4, z5, fpm0))
+
+/*
+** mlalltt_h7_f32_tied1:
+** msr fpmr, x0
+** mov (z[0-9]+\.b), b7
+** fmlalltt z0\.s, z4\.b, \1
+** ret
+*/
+TEST_DUAL_ZD (mlalltt_h7_f32_tied1, svfloat32_t, svmfloat8_t, mfloat8_t,
+ z0 = svmlalltt_n_f32_mf8_fpm (z0, z4, d7, fpm0),
+ z0 = svmlalltt_fpm (z0, z4, d7, fpm0))
+
+/*
+** mlalltt_h7_f32_untied:
+** msr fpmr, x0
+** mov (z[0-9]+\.b), b7
+** movprfx z0, z1
+** fmlalltt z0\.s, z4\.b, \1
+** ret
+*/
+TEST_DUAL_ZD (mlalltt_h7_f32_untied, svfloat32_t, svmfloat8_t, mfloat8_t,
+ z0 = svmlalltt_n_f32_mf8_fpm (z1, z4, d7, fpm0),
+ z0 = svmlalltt_fpm (z1, z4, d7, fpm0))
new file mode 100644
@@ -0,0 +1,91 @@
+/* { dg-do assemble { target aarch64_asm_fp8fma_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_fp8fma_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+fp8fma"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+ssve-fp8fma"
+#endif
+
+/*
+** mlalt_lane_0_f16_tied1:
+** msr fpmr, x0
+** fmlalt z0\.h, z4\.b, z5\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z (mlalt_lane_0_f16_tied1, svfloat16_t, svmfloat8_t,
+ z0 = svmlalt_lane_f16_mf8_fpm (z0, z4, z5, 0, fpm0),
+ z0 = svmlalt_lane_fpm (z0, z4, z5, 0, fpm0))
+
+/*
+** mlalt_lane_0_f16_tied2:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlalt z0\.h, \1\.b, z1\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z_REV (mlalt_lane_0_f16_tied2, svfloat16_t, svmfloat8_t,
+ z0_res = svmlalt_lane_f16_mf8_fpm (z4, z0, z1, 0, fpm0),
+ z0_res = svmlalt_lane_fpm (z4, z0, z1, 0, fpm0))
+
+/*
+** mlalt_lane_0_f16_tied3:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlalt z0\.h, z1\.b, \1\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z_REV (mlalt_lane_0_f16_tied3, svfloat16_t, svmfloat8_t,
+ z0_res = svmlalt_lane_f16_mf8_fpm (z4, z1, z0, 0, fpm0),
+ z0_res = svmlalt_lane_fpm (z4, z1, z0, 0, fpm0))
+
+/*
+** mlalt_lane_0_f16_untied:
+** msr fpmr, x0
+** movprfx z0, z1
+** fmlalt z0\.h, z4\.b, z5\.b\[0\]
+** ret
+*/
+TEST_DUAL_Z (mlalt_lane_0_f16_untied, svfloat16_t, svmfloat8_t,
+ z0 = svmlalt_lane_f16_mf8_fpm (z1, z4, z5, 0, fpm0),
+ z0 = svmlalt_lane_fpm (z1, z4, z5, 0, fpm0))
+
+/*
+** mlalt_lane_1_f16:
+** msr fpmr, x0
+** fmlalt z0\.h, z4\.b, z5\.b\[1\]
+** ret
+*/
+TEST_DUAL_Z (mlalt_lane_1_f16, svfloat16_t, svmfloat8_t,
+ z0 = svmlalt_lane_f16_mf8_fpm (z0, z4, z5, 1, fpm0),
+ z0 = svmlalt_lane_fpm (z0, z4, z5, 1, fpm0))
+
+/*
+** mlalt_lane_z8_f16:
+** ...
+** msr fpmr, x0
+** mov (z[0-7])\.d, z8\.d
+** fmlalt z0\.h, z1\.b, \1\.b\[1\]
+** ldr d8, \[sp\], 32
+** ret
+*/
+TEST_DUAL_LANE_REG (mlalt_lane_z8_f16, svfloat16_t, svmfloat8_t, z8,
+ z0 = svmlalt_lane_f16_mf8_fpm (z0, z1, z8, 1, fpm0),
+ z0 = svmlalt_lane_fpm (z0, z1, z8, 1, fpm0))
+
+/*
+** mlalt_lane_z16_f16:
+** ...
+** msr fpmr, x0
+** mov (z[0-7])\.d, z16\.d
+** fmlalt z0\.h, z1\.b, \1\.b\[15\]
+** ...
+** ret
+*/
+TEST_DUAL_LANE_REG (mlalt_lane_z16_f16, svfloat16_t, svmfloat8_t, z16,
+ z0 = svmlalt_lane_f16_mf8_fpm (z0, z1, z16, 15, fpm0),
+ z0 = svmlalt_lane_fpm (z0, z1, z16, 15, fpm0))
new file mode 100644
@@ -0,0 +1,78 @@
+/* { dg-do assemble { target aarch64_asm_fp8fma_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_fp8fma_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+fp8fma"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+ssve-fp8fma"
+#endif
+
+/*
+** mlalt_f16_mf8_tied1:
+** msr fpmr, x0
+** fmlalt z0\.h, z4\.b, z5\.b
+** ret
+*/
+TEST_DUAL_Z (mlalt_f16_mf8_tied1, svfloat16_t, svmfloat8_t,
+ z0 = svmlalt_f16_mf8_fpm (z0, z4, z5, fpm0),
+ z0 = svmlalt_fpm (z0, z4, z5, fpm0))
+
+/*
+** mlalt_f16_mf8_tied2:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlalt z0\.h, \1\.b, z1\.b
+** ret
+*/
+TEST_DUAL_Z_REV (mlalt_f16_mf8_tied2, svfloat16_t, svmfloat8_t,
+ z0_res = svmlalt_f16_mf8_fpm (z4, z0, z1, fpm0),
+ z0_res = svmlalt_fpm (z4, z0, z1, fpm0))
+
+/*
+** mlalt_f16_mf8_tied3:
+** msr fpmr, x0
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** fmlalt z0\.h, z1\.b, \1\.b
+** ret
+*/
+TEST_DUAL_Z_REV (mlalt_f16_mf8_tied3, svfloat16_t, svmfloat8_t,
+ z0_res = svmlalt_f16_mf8_fpm (z4, z1, z0, fpm0),
+ z0_res = svmlalt_fpm (z4, z1, z0, fpm0))
+
+/*
+** mlalt_f16_mf8_untied:
+** msr fpmr, x0
+** movprfx z0, z1
+** fmlalt z0\.h, z4\.b, z5\.b
+** ret
+*/
+TEST_DUAL_Z (mlalt_f16_mf8_untied, svfloat16_t, svmfloat8_t,
+ z0 = svmlalt_f16_mf8_fpm (z1, z4, z5, fpm0),
+ z0 = svmlalt_fpm (z1, z4, z5, fpm0))
+
+/*
+** mlalt_h7_f16_tied1:
+** msr fpmr, x0
+** mov (z[0-9]+\.b), b7
+** fmlalt z0\.h, z4\.b, \1
+** ret
+*/
+TEST_DUAL_ZD (mlalt_h7_f16_tied1, svfloat16_t, svmfloat8_t, mfloat8_t,
+ z0 = svmlalt_n_f16_mf8_fpm (z0, z4, d7, fpm0),
+ z0 = svmlalt_fpm (z0, z4, d7, fpm0))
+
+/*
+** mlalt_h7_f16_untied:
+** msr fpmr, x0
+** mov (z[0-9]+\.b), b7
+** movprfx z0, z1
+** fmlalt z0\.h, z4\.b, \1
+** ret
+*/
+TEST_DUAL_ZD (mlalt_h7_f16_untied, svfloat16_t, svmfloat8_t, mfloat8_t,
+ z0 = svmlalt_n_f16_mf8_fpm (z1, z4, d7, fpm0),
+ z0 = svmlalt_fpm (z1, z4, d7, fpm0))
@@ -12140,7 +12140,8 @@ proc check_effective_target_aarch64_tiny { } {
foreach { aarch64_ext } { "fp" "simd" "crypto" "crc" "lse" "dotprod" "sve"
"i8mm" "f32mm" "f64mm" "bf16" "sb" "sve2" "ls64"
"sme" "sme-i16i64" "sme2" "sve-b16b16"
- "sme-b16b16" "sme-f16f16" "sme2p1" "fp8" } {
+ "sme-b16b16" "sme-f16f16" "sme2p1" "fp8" "fp8fma"
+ "ssve-fp8fma" } {
eval [string map [list FUNC $aarch64_ext] {
proc check_effective_target_aarch64_asm_FUNC_ok { } {
if { [istarget aarch64*-*-*] } {