@@ -133,7 +133,7 @@ (define_insn "*aarch64_simd_mov<VDMOV:mode>"
return "mov\t%0.<Vbtype>, %1.<Vbtype>";
return "fmov\t%d0, %d1";
case 4:
- if (TARGET_SIMD)
+ if (TARGET_BASE_SIMD)
return "umov\t%0, %1.d[0]";
return "fmov\t%x0, %d1";
case 5: return "fmov\t%d0, %1";
@@ -152,9 +152,9 @@ (define_insn "*aarch64_simd_mov<VDMOV:mode>"
(define_insn "*aarch64_simd_mov<VQMOV:mode>"
[(set (match_operand:VQMOV 0 "nonimmediate_operand"
- "=w, Umn, m, w, ?r, ?w, ?r, w, w")
+ "=w, Umn, m, w, w, ?r, ?w, ?r, w, w")
(match_operand:VQMOV 1 "general_operand"
- "m, Dz, w, w, w, r, r, Dn, Dz"))]
+ "m, Dz, w, w, w, w, r, r, Dn, Dz"))]
"TARGET_FLOAT
&& (register_operand (operands[0], <MODE>mode)
|| aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
@@ -170,22 +170,24 @@ (define_insn "*aarch64_simd_mov<VQMOV:mode>"
case 3:
return "mov\t%0.<Vbtype>, %1.<Vbtype>";
case 4:
+ return "mov\t%Z0.d, %Z1.d";
case 5:
case 6:
- return "#";
case 7:
- return aarch64_output_simd_mov_immediate (operands[1], 128);
+ return "#";
case 8:
+ return aarch64_output_simd_mov_immediate (operands[1], 128);
+ case 9:
return "fmov\t%d0, xzr";
default:
gcc_unreachable ();
}
}
[(set_attr "type" "neon_load1_1reg<q>, store_16, neon_store1_1reg<q>,\
- neon_logic<q>, multiple, multiple,\
- multiple, neon_move<q>, fmov")
- (set_attr "length" "4,4,4,4,8,8,8,4,4")
- (set_attr "arch" "*,*,*,simd,*,*,*,simd,*")]
+ neon_logic<q>, *, multiple, multiple,\
+ multiple, neon_move<q>, f_mcr")
+ (set_attr "length" "4,4,4,4,4,8,8,8,4,4")
+ (set_attr "arch" "*,*,*,simd,sve,*,*,*,simd,*")]
)
;; When storing lane zero we can use the normal STR and its more permissive
@@ -195,7 +197,7 @@ (define_insn "aarch64_store_lane0<mode>"
[(set (match_operand:<VEL> 0 "memory_operand" "=m")
(vec_select:<VEL> (match_operand:VALL_F16 1 "register_operand" "w")
(parallel [(match_operand 2 "const_int_operand" "n")])))]
- "TARGET_SIMD
+ "TARGET_FLOAT
&& ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 0"
"str\\t%<Vetype>1, %0"
[(set_attr "type" "neon_store1_1reg<q>")]
@@ -353,35 +355,38 @@ (define_expand "aarch64_get_high<mode>"
)
(define_insn_and_split "aarch64_simd_mov_from_<mode>low"
- [(set (match_operand:<VHALF> 0 "register_operand" "=w,?r")
+ [(set (match_operand:<VHALF> 0 "register_operand" "=w,?r,?r")
(vec_select:<VHALF>
- (match_operand:VQMOV_NO2E 1 "register_operand" "w,w")
+ (match_operand:VQMOV_NO2E 1 "register_operand" "w,w,w")
(match_operand:VQMOV_NO2E 2 "vect_par_cnst_lo_half" "")))]
- "TARGET_SIMD"
+ "TARGET_FLOAT"
"@
#
- umov\t%0, %1.d[0]"
+ umov\t%0, %1.d[0]
+ fmov\t%0, %d1"
"&& reload_completed && aarch64_simd_register (operands[0], <VHALF>mode)"
[(set (match_dup 0) (match_dup 1))]
{
operands[1] = aarch64_replace_reg_mode (operands[1], <VHALF>mode);
}
- [(set_attr "type" "mov_reg,neon_to_gp<q>")
+ [(set_attr "type" "mov_reg,neon_to_gp<q>,f_mrc")
+ (set_attr "arch" "simd,base_simd,*")
(set_attr "length" "4")]
)
(define_insn "aarch64_simd_mov_from_<mode>high"
- [(set (match_operand:<VHALF> 0 "register_operand" "=w,?r,?r")
+ [(set (match_operand:<VHALF> 0 "register_operand" "=w,w,?r,?r")
(vec_select:<VHALF>
- (match_operand:VQMOV_NO2E 1 "register_operand" "w,w,w")
+ (match_operand:VQMOV_NO2E 1 "register_operand" "w,0,w,w")
(match_operand:VQMOV_NO2E 2 "vect_par_cnst_hi_half" "")))]
"TARGET_FLOAT"
"@
dup\t%d0, %1.d[1]
+ ext\t%Z0.b, %Z0.b, %Z0.b, #8
umov\t%0, %1.d[1]
fmov\t%0, %1.d[1]"
- [(set_attr "type" "neon_dup<q>,neon_to_gp<q>,f_mrc")
- (set_attr "arch" "simd,simd,*")
+ [(set_attr "type" "neon_dup<q>,*,neon_to_gp<q>,f_mrc")
+ (set_attr "arch" "simd,sve,simd,*")
(set_attr "length" "4")]
)
@@ -3726,7 +3726,7 @@ static bool
aarch64_array_mode_supported_p (machine_mode mode,
unsigned HOST_WIDE_INT nelems)
{
- if (TARGET_SIMD
+ if (TARGET_BASE_SIMD
&& (AARCH64_VALID_SIMD_QREG_MODE (mode)
|| AARCH64_VALID_SIMD_DREG_MODE (mode))
&& (nelems >= 2 && nelems <= 4))
@@ -11876,6 +11876,10 @@ sizetochar (int size)
'N': Take the duplicated element in a vector constant
and print the negative of it in decimal.
'b/h/s/d/q': Print a scalar FP/SIMD register name.
+   'Z': Same, but print an SVE ('z') register name.  ('z' itself was
+	already taken as a code.)  Note that %Z is not needed for
+	operands that already have SVE modes; by convention %Z is used
+	only for operands with non-SVE (or potentially non-SVE) modes.
'S/T/U/V': Print a FP/SIMD register name for a register list.
The register printed is the FP/SIMD register name
of X + 0/1/2/3 for S/T/U/V.
@@ -12048,6 +12052,8 @@ aarch64_print_operand (FILE *f, rtx x, int code)
case 's':
case 'd':
case 'q':
+ case 'Z':
+ code = TOLOWER (code);
if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
{
output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
@@ -12702,8 +12708,8 @@ aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
return NO_REGS;
}
- /* Without the TARGET_SIMD instructions we cannot move a Q register
- to a Q register directly. We need a scratch. */
+ /* Without the TARGET_SIMD or TARGET_SVE instructions we cannot move a
+ Q register to a Q register directly. We need a scratch. */
if (REG_P (x)
&& (mode == TFmode
|| mode == TImode
@@ -15273,7 +15279,7 @@ aarch64_register_move_cost (machine_mode mode,
secondary reload. A general register is used as a scratch to move
the upper DI value and the lower DI value is moved directly,
hence the cost is the sum of three moves. */
- if (! TARGET_SIMD)
+ if (!TARGET_SIMD && !TARGET_SVE)
return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
return regmove_cost->FP2FP;
@@ -20773,7 +20779,7 @@ aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
return aarch64_full_sve_mode (mode).else_mode (word_mode);
gcc_assert (known_eq (width, 64) || known_eq (width, 128));
- if (TARGET_SIMD)
+ if (TARGET_BASE_SIMD)
{
if (known_eq (width, 128))
return aarch64_vq_mode (mode).else_mode (word_mode);
@@ -24908,7 +24914,11 @@ aarch64_expand_cpymem (rtx *operands)
int copy_bits = 256;
/* Default to 256-bit LDP/STP on large copies, however small copies, no SIMD
- support or slow 256-bit LDP/STP fall back to 128-bit chunks. */
+ support or slow 256-bit LDP/STP fall back to 128-bit chunks.
+
+     ??? Although LDP/STP of Q registers would also be valid in streaming
+     mode (i.e. under TARGET_BASE_SIMD rather than TARGET_SIMD), it is
+     not clear whether using them there would improve performance.  */
if (size <= 24
|| !TARGET_SIMD
|| (aarch64_tune_params.extra_tuning_flags
@@ -61,8 +61,15 @@
#define WORDS_BIG_ENDIAN (BYTES_BIG_ENDIAN)
/* AdvSIMD is supported in the default configuration, unless disabled by
- -mgeneral-regs-only or by the +nosimd extension. */
-#define TARGET_SIMD (AARCH64_ISA_SIMD)
+ -mgeneral-regs-only or by the +nosimd extension. The set of available
+ instructions is then subdivided into:
+
+ - the "base" set, available both in SME streaming mode and in
+ non-streaming mode
+
+ - the full set, available only in non-streaming mode. */
+#define TARGET_BASE_SIMD (AARCH64_ISA_SIMD)
+#define TARGET_SIMD (AARCH64_ISA_SIMD && AARCH64_ISA_SM_OFF)
#define TARGET_FLOAT (AARCH64_ISA_FP)
#define UNITS_PER_WORD 8
@@ -199,6 +206,7 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF;
/* Macros to test ISA flags. */
+#define AARCH64_ISA_SM_OFF (aarch64_isa_flags & AARCH64_FL_SM_OFF)
#define AARCH64_ISA_MODE (aarch64_isa_flags & AARCH64_FL_ISA_MODES)
#define AARCH64_ISA_CRC (aarch64_isa_flags & AARCH64_FL_CRC)
#define AARCH64_ISA_CRYPTO (aarch64_isa_flags & AARCH64_FL_CRYPTO)
@@ -374,7 +374,7 @@ (define_constants
;; As a convenience, "fp_q" means "fp" + the ability to move between
;; Q registers and is equivalent to "simd".
-(define_enum "arches" [ any rcpc8_4 fp fp_q simd sve fp16])
+(define_enum "arches" [any rcpc8_4 fp fp_q base_simd simd sve fp16])
(define_enum_attr "arch" "arches" (const_string "any"))
@@ -402,6 +402,9 @@ (define_attr "arch_enabled" "no,yes"
(and (eq_attr "arch" "fp")
(match_test "TARGET_FLOAT"))
+ (and (eq_attr "arch" "base_simd")
+ (match_test "TARGET_BASE_SIMD"))
+
(and (eq_attr "arch" "fp_q, simd")
(match_test "TARGET_SIMD"))
@@ -1215,7 +1218,7 @@ (define_insn "*mov<mode>_aarch64"
case 8:
return "str\t%<size>1, %0";
case 9:
- return TARGET_SIMD ? "umov\t%w0, %1.<v>[0]" : "fmov\t%w0, %s1";
+ return TARGET_BASE_SIMD ? "umov\t%w0, %1.<v>[0]" : "fmov\t%w0, %s1";
case 10:
return TARGET_SIMD ? "dup\t%0.<Vallxd>, %w1" : "fmov\t%s0, %w1";
case 11:
@@ -1395,9 +1398,9 @@ (define_expand "movti"
(define_insn "*movti_aarch64"
[(set (match_operand:TI 0
- "nonimmediate_operand" "= r,w,w,w, r,w,r,m,m,w,m")
+ "nonimmediate_operand" "= r,w,w,w, r,w,w,r,m,m,w,m")
(match_operand:TI 1
- "aarch64_movti_operand" " rUti,Z,Z,r, w,w,m,r,Z,m,w"))]
+ "aarch64_movti_operand" " rUti,Z,Z,r, w,w,w,m,r,Z,m,w"))]
"(register_operand (operands[0], TImode)
|| aarch64_reg_or_zero (operands[1], TImode))"
"@
@@ -1407,16 +1410,17 @@ (define_insn "*movti_aarch64"
#
#
mov\\t%0.16b, %1.16b
+ mov\\t%Z0.d, %Z1.d
ldp\\t%0, %H0, %1
stp\\t%1, %H1, %0
stp\\txzr, xzr, %0
ldr\\t%q0, %1
str\\t%q1, %0"
- [(set_attr "type" "multiple,neon_move,f_mcr,f_mcr,f_mrc,neon_logic_q, \
+ [(set_attr "type" "multiple,neon_move,f_mcr,f_mcr,f_mrc,neon_logic_q,*,\
load_16,store_16,store_16,\
load_16,store_16")
- (set_attr "length" "8,4,4,8,8,4,4,4,4,4,4")
- (set_attr "arch" "*,simd,*,*,*,simd,*,*,*,fp,fp")]
+ (set_attr "length" "8,4,4,8,8,4,4,4,4,4,4,4")
+ (set_attr "arch" "*,simd,*,*,*,simd,sve,*,*,*,fp,fp")]
)
;; Split a TImode register-register or register-immediate move into
@@ -1552,13 +1556,14 @@ (define_split
(define_insn "*mov<mode>_aarch64"
[(set (match_operand:TFD 0
- "nonimmediate_operand" "=w,?r ,w ,?r,w,?w,w,m,?r,m ,m")
+ "nonimmediate_operand" "=w,w,?r ,w ,?r,w,?w,w,m,?r,m ,m")
(match_operand:TFD 1
- "general_operand" " w,?rY,?r,w ,Y,Y ,m,w,m ,?r,Y"))]
+ "general_operand" " w,w,?rY,?r,w ,Y,Y ,m,w,m ,?r,Y"))]
"TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
|| aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
"@
mov\\t%0.16b, %1.16b
+ mov\\t%Z0.d, %Z1.d
#
#
#
@@ -1569,10 +1574,10 @@ (define_insn "*mov<mode>_aarch64"
ldp\\t%0, %H0, %1
stp\\t%1, %H1, %0
stp\\txzr, xzr, %0"
- [(set_attr "type" "logic_reg,multiple,f_mcr,f_mrc,neon_move_q,f_mcr,\
+ [(set_attr "type" "logic_reg,*,multiple,f_mcr,f_mrc,neon_move_q,f_mcr,\
f_loadd,f_stored,load_16,store_16,store_16")
- (set_attr "length" "4,8,8,8,4,4,4,4,4,4,4")
- (set_attr "arch" "simd,*,*,*,simd,*,*,*,*,*,*")]
+ (set_attr "length" "4,4,8,8,8,4,4,4,4,4,4,4")
+ (set_attr "arch" "simd,sve,*,*,*,simd,*,*,*,*,*,*")]
)
(define_split
@@ -1756,7 +1761,7 @@ (define_insn "load_pair_dw_tftf"
(match_operand:TF 1 "aarch64_mem_pair_operand" "Ump"))
(set (match_operand:TF 2 "register_operand" "=w")
(match_operand:TF 3 "memory_operand" "m"))]
- "TARGET_SIMD
+ "TARGET_BASE_SIMD
&& rtx_equal_p (XEXP (operands[3], 0),
plus_constant (Pmode,
XEXP (operands[1], 0),
@@ -1806,11 +1811,11 @@ (define_insn "store_pair_dw_tftf"
(match_operand:TF 1 "register_operand" "w"))
(set (match_operand:TF 2 "memory_operand" "=m")
(match_operand:TF 3 "register_operand" "w"))]
- "TARGET_SIMD &&
- rtx_equal_p (XEXP (operands[2], 0),
- plus_constant (Pmode,
- XEXP (operands[0], 0),
- GET_MODE_SIZE (TFmode)))"
+ "TARGET_BASE_SIMD
+ && rtx_equal_p (XEXP (operands[2], 0),
+ plus_constant (Pmode,
+ XEXP (operands[0], 0),
+ GET_MODE_SIZE (TFmode)))"
"stp\\t%q1, %q3, %z0"
[(set_attr "type" "neon_stp_q")
(set_attr "fp" "yes")]
@@ -1858,7 +1863,7 @@ (define_insn "loadwb_pair<TX:mode>_<P:mode>"
(set (match_operand:TX 3 "register_operand" "=w")
(mem:TX (plus:P (match_dup 1)
(match_operand:P 5 "const_int_operand" "n"))))])]
- "TARGET_SIMD && INTVAL (operands[5]) == GET_MODE_SIZE (<TX:MODE>mode)"
+ "TARGET_BASE_SIMD && INTVAL (operands[5]) == GET_MODE_SIZE (<TX:MODE>mode)"
"ldp\\t%q2, %q3, [%1], %4"
[(set_attr "type" "neon_ldp_q")]
)
@@ -1908,7 +1913,7 @@ (define_insn "storewb_pair<TX:mode>_<P:mode>"
(set (mem:TX (plus:P (match_dup 0)
(match_operand:P 5 "const_int_operand" "n")))
(match_operand:TX 3 "register_operand" "w"))])]
- "TARGET_SIMD
+ "TARGET_BASE_SIMD
&& INTVAL (operands[5])
== INTVAL (operands[4]) + GET_MODE_SIZE (<TX:MODE>mode)"
"stp\\t%q2, %q3, [%0, %4]!"
new file mode 100644
@@ -0,0 +1,51 @@
+/* { dg-do assemble } */
+/* { dg-options "-O --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** fpr_to_fpr:
+** fmov d0, d1
+** ret
+*/
+double __attribute__((arm_streaming_compatible))
+fpr_to_fpr (double q0, double q1)
+{
+ return q1;
+}
+
+/*
+** gpr_to_fpr:
+** fmov d0, x0
+** ret
+*/
+double __attribute__((arm_streaming_compatible))
+gpr_to_fpr ()
+{
+ register double x0 asm ("x0");
+ asm volatile ("" : "=r" (x0));
+ return x0;
+}
+
+/*
+** zero_to_fpr:
+** fmov d0, xzr
+** ret
+*/
+double __attribute__((arm_streaming_compatible))
+zero_to_fpr ()
+{
+ return 0;
+}
+
+/*
+** fpr_to_gpr:
+** fmov x0, d0
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+fpr_to_gpr (double q0)
+{
+ register double x0 asm ("x0");
+ x0 = q0;
+ asm volatile ("" :: "r" (x0));
+}
new file mode 100644
@@ -0,0 +1,59 @@
+/* { dg-do assemble } */
+/* { dg-options "-O --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <stdint.h>
+
+/*
+** fpr_to_fpr:
+** fmov d0, d1
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+fpr_to_fpr (void)
+{
+ register uint64_t q0 asm ("q0");
+ register uint64_t q1 asm ("q1");
+ asm volatile ("" : "=w" (q1));
+ q0 = q1;
+ asm volatile ("" :: "w" (q0));
+}
+
+/*
+** gpr_to_fpr:
+** fmov d0, x0
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+gpr_to_fpr (uint64_t x0)
+{
+ register uint64_t q0 asm ("q0");
+ q0 = x0;
+ asm volatile ("" :: "w" (q0));
+}
+
+/*
+** zero_to_fpr:
+** fmov d0, xzr
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+zero_to_fpr ()
+{
+ register uint64_t q0 asm ("q0");
+ q0 = 0;
+ asm volatile ("" :: "w" (q0));
+}
+
+/*
+** fpr_to_gpr:
+** fmov x0, d0
+** ret
+*/
+uint64_t __attribute__((arm_streaming_compatible))
+fpr_to_gpr ()
+{
+ register uint64_t q0 asm ("q0");
+ asm volatile ("" : "=w" (q0));
+ return q0;
+}
new file mode 100644
@@ -0,0 +1,53 @@
+/* { dg-do assemble } */
+/* { dg-options "-O --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#pragma GCC target "+nothing+simd"
+
+/*
+** fpr_to_fpr:
+** fmov s0, s1
+** ret
+*/
+_Float16 __attribute__((arm_streaming_compatible))
+fpr_to_fpr (_Float16 q0, _Float16 q1)
+{
+ return q1;
+}
+
+/*
+** gpr_to_fpr:
+** fmov s0, w0
+** ret
+*/
+_Float16 __attribute__((arm_streaming_compatible))
+gpr_to_fpr ()
+{
+ register _Float16 w0 asm ("w0");
+ asm volatile ("" : "=r" (w0));
+ return w0;
+}
+
+/*
+** zero_to_fpr:
+** fmov s0, wzr
+** ret
+*/
+_Float16 __attribute__((arm_streaming_compatible))
+zero_to_fpr ()
+{
+ return 0;
+}
+
+/*
+** fpr_to_gpr:
+** fmov w0, s0
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+fpr_to_gpr (_Float16 q0)
+{
+ register _Float16 w0 asm ("w0");
+ w0 = q0;
+ asm volatile ("" :: "r" (w0));
+}
new file mode 100644
@@ -0,0 +1,61 @@
+/* { dg-do assemble } */
+/* { dg-options "-O --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#pragma GCC target "+nothing+simd"
+
+#include <stdint.h>
+
+/*
+** fpr_to_fpr:
+** fmov s0, s1
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+fpr_to_fpr (void)
+{
+ register uint16_t q0 asm ("q0");
+ register uint16_t q1 asm ("q1");
+ asm volatile ("" : "=w" (q1));
+ q0 = q1;
+ asm volatile ("" :: "w" (q0));
+}
+
+/*
+** gpr_to_fpr:
+** fmov s0, w0
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+gpr_to_fpr (uint16_t w0)
+{
+ register uint16_t q0 asm ("q0");
+ q0 = w0;
+ asm volatile ("" :: "w" (q0));
+}
+
+/*
+** zero_to_fpr:
+** fmov s0, wzr
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+zero_to_fpr ()
+{
+ register uint16_t q0 asm ("q0");
+ q0 = 0;
+ asm volatile ("" :: "w" (q0));
+}
+
+/*
+** fpr_to_gpr:
+** umov w0, v0.h\[0\]
+** ret
+*/
+uint16_t __attribute__((arm_streaming_compatible))
+fpr_to_gpr ()
+{
+ register uint16_t q0 asm ("q0");
+ asm volatile ("" : "=w" (q0));
+ return q0;
+}
new file mode 100644
@@ -0,0 +1,59 @@
+/* { dg-do assemble } */
+/* { dg-options "-O --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <stdint.h>
+
+/*
+** fpr_to_fpr:
+** fmov s0, s1
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+fpr_to_fpr (void)
+{
+ register uint8_t q0 asm ("q0");
+ register uint8_t q1 asm ("q1");
+ asm volatile ("" : "=w" (q1));
+ q0 = q1;
+ asm volatile ("" :: "w" (q0));
+}
+
+/*
+** gpr_to_fpr:
+** fmov s0, w0
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+gpr_to_fpr (uint8_t w0)
+{
+ register uint8_t q0 asm ("q0");
+ q0 = w0;
+ asm volatile ("" :: "w" (q0));
+}
+
+/*
+** zero_to_fpr:
+** fmov s0, wzr
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+zero_to_fpr ()
+{
+ register uint8_t q0 asm ("q0");
+ q0 = 0;
+ asm volatile ("" :: "w" (q0));
+}
+
+/*
+** fpr_to_gpr:
+** umov w0, v0.b\[0\]
+** ret
+*/
+uint8_t __attribute__((arm_streaming_compatible))
+fpr_to_gpr ()
+{
+ register uint8_t q0 asm ("q0");
+ asm volatile ("" : "=w" (q0));
+ return q0;
+}
new file mode 100644
@@ -0,0 +1,51 @@
+/* { dg-do assemble } */
+/* { dg-options "-O --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** fpr_to_fpr:
+** fmov s0, s1
+** ret
+*/
+float __attribute__((arm_streaming_compatible))
+fpr_to_fpr (float q0, float q1)
+{
+ return q1;
+}
+
+/*
+** gpr_to_fpr:
+** fmov s0, w0
+** ret
+*/
+float __attribute__((arm_streaming_compatible))
+gpr_to_fpr ()
+{
+ register float w0 asm ("w0");
+ asm volatile ("" : "=r" (w0));
+ return w0;
+}
+
+/*
+** zero_to_fpr:
+** fmov s0, wzr
+** ret
+*/
+float __attribute__((arm_streaming_compatible))
+zero_to_fpr ()
+{
+ return 0;
+}
+
+/*
+** fpr_to_gpr:
+** fmov w0, s0
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+fpr_to_gpr (float q0)
+{
+ register float w0 asm ("w0");
+ w0 = q0;
+ asm volatile ("" :: "r" (w0));
+}
new file mode 100644
@@ -0,0 +1,59 @@
+/* { dg-do assemble } */
+/* { dg-options "-O --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <stdint.h>
+
+/*
+** fpr_to_fpr:
+** fmov s0, s1
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+fpr_to_fpr (void)
+{
+ register uint32_t q0 asm ("q0");
+ register uint32_t q1 asm ("q1");
+ asm volatile ("" : "=w" (q1));
+ q0 = q1;
+ asm volatile ("" :: "w" (q0));
+}
+
+/*
+** gpr_to_fpr:
+** fmov s0, w0
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+gpr_to_fpr (uint32_t w0)
+{
+ register uint32_t q0 asm ("q0");
+ q0 = w0;
+ asm volatile ("" :: "w" (q0));
+}
+
+/*
+** zero_to_fpr:
+** fmov s0, wzr
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+zero_to_fpr ()
+{
+ register uint32_t q0 asm ("q0");
+ q0 = 0;
+ asm volatile ("" :: "w" (q0));
+}
+
+/*
+** fpr_to_gpr:
+** fmov w0, s0
+** ret
+*/
+uint32_t __attribute__((arm_streaming_compatible))
+fpr_to_gpr ()
+{
+ register uint32_t q0 asm ("q0");
+ asm volatile ("" : "=w" (q0));
+ return q0;
+}
new file mode 100644
@@ -0,0 +1,81 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target large_long_double } */
+/* { dg-options "-O -mtune=neoverse-v1 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#pragma GCC target "+nosve"
+
+/*
+** fpr_to_fpr:
+** sub sp, sp, #16
+** str q1, \[sp\]
+** ldr q0, \[sp\]
+** add sp, sp, #?16
+** ret
+*/
+long double __attribute__((arm_streaming_compatible))
+fpr_to_fpr (long double q0, long double q1)
+{
+ return q1;
+}
+
+/*
+** gpr_to_fpr: { target aarch64_little_endian }
+** fmov d0, x0
+** fmov v0.d\[1\], x1
+** ret
+*/
+/*
+** gpr_to_fpr: { target aarch64_big_endian }
+** fmov d0, x1
+** fmov v0.d\[1\], x0
+** ret
+*/
+long double __attribute__((arm_streaming_compatible))
+gpr_to_fpr ()
+{
+ register long double x0 asm ("x0");
+ asm volatile ("" : "=r" (x0));
+ return x0;
+}
+
+/*
+** zero_to_fpr:
+** fmov s0, wzr
+** ret
+*/
+long double __attribute__((arm_streaming_compatible))
+zero_to_fpr ()
+{
+ return 0;
+}
+
+/*
+** fpr_to_gpr: { target aarch64_little_endian }
+** (
+** fmov x0, d0
+** fmov x1, v0.d\[1\]
+** |
+** fmov x1, v0.d\[1\]
+** fmov x0, d0
+** )
+** ret
+*/
+/*
+** fpr_to_gpr: { target aarch64_big_endian }
+** (
+** fmov x1, d0
+** fmov x0, v0.d\[1\]
+** |
+** fmov x0, v0.d\[1\]
+** fmov x1, d0
+** )
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+fpr_to_gpr (long double q0)
+{
+ register long double x0 asm ("x0");
+ x0 = q0;
+ asm volatile ("" :: "r" (x0));
+}
new file mode 100644
@@ -0,0 +1,78 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target large_long_double } */
+/* { dg-options "-O -mtune=neoverse-v1 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#pragma GCC target "+sve"
+
+/*
+** fpr_to_fpr:
+** mov z0.d, z1.d
+** ret
+*/
+long double __attribute__((arm_streaming_compatible))
+fpr_to_fpr (long double q0, long double q1)
+{
+ return q1;
+}
+
+/*
+** gpr_to_fpr: { target aarch64_little_endian }
+** fmov d0, x0
+** fmov v0.d\[1\], x1
+** ret
+*/
+/*
+** gpr_to_fpr: { target aarch64_big_endian }
+** fmov d0, x1
+** fmov v0.d\[1\], x0
+** ret
+*/
+long double __attribute__((arm_streaming_compatible))
+gpr_to_fpr ()
+{
+ register long double x0 asm ("x0");
+ asm volatile ("" : "=r" (x0));
+ return x0;
+}
+
+/*
+** zero_to_fpr:
+** fmov s0, wzr
+** ret
+*/
+long double __attribute__((arm_streaming_compatible))
+zero_to_fpr ()
+{
+ return 0;
+}
+
+/*
+** fpr_to_gpr: { target aarch64_little_endian }
+** (
+** fmov x0, d0
+** fmov x1, v0.d\[1\]
+** |
+** fmov x1, v0.d\[1\]
+** fmov x0, d0
+** )
+** ret
+*/
+/*
+** fpr_to_gpr: { target aarch64_big_endian }
+** (
+** fmov x1, d0
+** fmov x0, v0.d\[1\]
+** |
+** fmov x0, v0.d\[1\]
+** fmov x1, d0
+** )
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+fpr_to_gpr (long double q0)
+{
+ register long double x0 asm ("x0");
+ x0 = q0;
+ asm volatile ("" :: "r" (x0));
+}
new file mode 100644
@@ -0,0 +1,86 @@
+/* { dg-do assemble } */
+/* { dg-options "-O -mtune=neoverse-v1 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#pragma GCC target "+nosve"
+
+/*
+** fpr_to_fpr:
+** sub sp, sp, #16
+** str q1, \[sp\]
+** ldr q0, \[sp\]
+** add sp, sp, #?16
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+fpr_to_fpr (void)
+{
+ register __int128_t q0 asm ("q0");
+ register __int128_t q1 asm ("q1");
+ asm volatile ("" : "=w" (q1));
+ q0 = q1;
+ asm volatile ("" :: "w" (q0));
+}
+
+/*
+** gpr_to_fpr: { target aarch64_little_endian }
+** fmov d0, x0
+** fmov v0.d\[1\], x1
+** ret
+*/
+/*
+** gpr_to_fpr: { target aarch64_big_endian }
+** fmov d0, x1
+** fmov v0.d\[1\], x0
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+gpr_to_fpr (__int128_t x0)
+{
+ register __int128_t q0 asm ("q0");
+ q0 = x0;
+ asm volatile ("" :: "w" (q0));
+}
+
+/*
+** zero_to_fpr:
+** fmov d0, xzr
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+zero_to_fpr ()
+{
+ register __int128_t q0 asm ("q0");
+ q0 = 0;
+ asm volatile ("" :: "w" (q0));
+}
+
+/*
+** fpr_to_gpr: { target aarch64_little_endian }
+** (
+** fmov x0, d0
+** fmov x1, v0.d\[1\]
+** |
+** fmov x1, v0.d\[1\]
+** fmov x0, d0
+** )
+** ret
+*/
+/*
+** fpr_to_gpr: { target aarch64_big_endian }
+** (
+** fmov x1, d0
+** fmov x0, v0.d\[1\]
+** |
+** fmov x0, v0.d\[1\]
+** fmov x1, d0
+** )
+** ret
+*/
+__int128_t __attribute__((arm_streaming_compatible))
+fpr_to_gpr ()
+{
+ register __int128_t q0 asm ("q0");
+ asm volatile ("" : "=w" (q0));
+ return q0;
+}
new file mode 100644
@@ -0,0 +1,83 @@
+/* { dg-do assemble } */
+/* { dg-options "-O -mtune=neoverse-v1 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#pragma GCC target "+sve"
+
+/*
+** fpr_to_fpr:
+** mov z0\.d, z1\.d
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+fpr_to_fpr (void)
+{
+ register __int128_t q0 asm ("q0");
+ register __int128_t q1 asm ("q1");
+ asm volatile ("" : "=w" (q1));
+ q0 = q1;
+ asm volatile ("" :: "w" (q0));
+}
+
+/*
+** gpr_to_fpr: { target aarch64_little_endian }
+** fmov d0, x0
+** fmov v0.d\[1\], x1
+** ret
+*/
+/*
+** gpr_to_fpr: { target aarch64_big_endian }
+** fmov d0, x1
+** fmov v0.d\[1\], x0
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+gpr_to_fpr (__int128_t x0)
+{
+ register __int128_t q0 asm ("q0");
+ q0 = x0;
+ asm volatile ("" :: "w" (q0));
+}
+
+/*
+** zero_to_fpr:
+** fmov d0, xzr
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+zero_to_fpr ()
+{
+ register __int128_t q0 asm ("q0");
+ q0 = 0;
+ asm volatile ("" :: "w" (q0));
+}
+
+/*
+** fpr_to_gpr: { target aarch64_little_endian }
+** (
+** fmov x0, d0
+** fmov x1, v0.d\[1\]
+** |
+** fmov x1, v0.d\[1\]
+** fmov x0, d0
+** )
+** ret
+*/
+/*
+** fpr_to_gpr: { target aarch64_big_endian }
+** (
+** fmov x1, d0
+** fmov x0, v0.d\[1\]
+** |
+** fmov x0, v0.d\[1\]
+** fmov x1, d0
+** )
+** ret
+*/
+__int128_t __attribute__((arm_streaming_compatible))
+fpr_to_gpr ()
+{
+ register __int128_t q0 asm ("q0");
+ asm volatile ("" : "=w" (q0));
+ return q0;
+}
new file mode 100644
@@ -0,0 +1,82 @@
+/* { dg-do assemble } */
+/* { dg-options "-O -mtune=neoverse-v1 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#pragma GCC target "+nosve"
+
+typedef unsigned char v16qi __attribute__((vector_size(16)));
+
+/*
+** fpr_to_fpr:
+** sub sp, sp, #16
+** str q1, \[sp\]
+** ldr q0, \[sp\]
+** add sp, sp, #?16
+** ret
+*/
+v16qi __attribute__((arm_streaming_compatible))
+fpr_to_fpr (v16qi q0, v16qi q1)
+{
+ return q1;
+}
+
+/*
+** gpr_to_fpr: { target aarch64_little_endian }
+** fmov d0, x0
+** fmov v0.d\[1\], x1
+** ret
+*/
+/*
+** gpr_to_fpr: { target aarch64_big_endian }
+** fmov d0, x1
+** fmov v0.d\[1\], x0
+** ret
+*/
+v16qi __attribute__((arm_streaming_compatible))
+gpr_to_fpr ()
+{
+ register v16qi x0 asm ("x0");
+ asm volatile ("" : "=r" (x0));
+ return x0;
+}
+
+/*
+** zero_to_fpr:
+** fmov d0, xzr
+** ret
+*/
+v16qi __attribute__((arm_streaming_compatible))
+zero_to_fpr ()
+{
+ return (v16qi) {};
+}
+
+/*
+** fpr_to_gpr: { target aarch64_little_endian }
+** (
+** umov x0, v0.d\[0\]
+** fmov x1, v0.d\[1\]
+** |
+** fmov x1, v0.d\[1\]
+** umov x0, v0.d\[0\]
+** )
+** ret
+*/
+/*
+** fpr_to_gpr: { target aarch64_big_endian }
+** (
+** umov x1, v0.d\[0\]
+** fmov x0, v0.d\[1\]
+** |
+** fmov x0, v0.d\[1\]
+** umov x1, v0.d\[0\]
+** )
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+fpr_to_gpr (v16qi q0)
+{
+ register v16qi x0 asm ("x0");
+ x0 = q0;
+ asm volatile ("" :: "r" (x0));
+}
new file mode 100644
@@ -0,0 +1,79 @@
+/* { dg-do assemble } */
+/* { dg-options "-O -mtune=neoverse-v1 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#pragma GCC target "+sve"
+
+typedef unsigned char v16qi __attribute__((vector_size(16)));
+
+/*
+** fpr_to_fpr:
+** mov z0.d, z1.d
+** ret
+*/
+v16qi __attribute__((arm_streaming_compatible))
+fpr_to_fpr (v16qi q0, v16qi q1)
+{
+ return q1;
+}
+
+/*
+** gpr_to_fpr: { target aarch64_little_endian }
+** fmov d0, x0
+** fmov v0.d\[1\], x1
+** ret
+*/
+/*
+** gpr_to_fpr: { target aarch64_big_endian }
+** fmov d0, x1
+** fmov v0.d\[1\], x0
+** ret
+*/
+v16qi __attribute__((arm_streaming_compatible))
+gpr_to_fpr ()
+{
+ register v16qi x0 asm ("x0");
+ asm volatile ("" : "=r" (x0));
+ return x0;
+}
+
+/*
+** zero_to_fpr:
+** fmov d0, xzr
+** ret
+*/
+v16qi __attribute__((arm_streaming_compatible))
+zero_to_fpr ()
+{
+ return (v16qi) {};
+}
+
+/*
+** fpr_to_gpr: { target aarch64_little_endian }
+** (
+** umov x0, v0.d\[0\]
+** fmov x1, v0.d\[1\]
+** |
+** fmov x1, v0.d\[1\]
+** umov x0, v0.d\[0\]
+** )
+** ret
+*/
+/*
+** fpr_to_gpr: { target aarch64_big_endian }
+** (
+** umov x1, v0.d\[0\]
+** fmov x0, v0.d\[1\]
+** |
+** fmov x0, v0.d\[1\]
+** umov x1, v0.d\[0\]
+** )
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+fpr_to_gpr (v16qi q0)
+{
+ register v16qi x0 asm ("x0");
+ x0 = q0;
+ asm volatile ("" :: "r" (x0));
+}
new file mode 100644
@@ -0,0 +1,55 @@
+/* { dg-do assemble } */
+/* { dg-options "-O -mtune=neoverse-v1 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#pragma GCC target "+nosve"
+
+typedef unsigned char v8qi __attribute__((vector_size(8)));
+
+/*
+** fpr_to_fpr:
+** fmov d0, d1
+** ret
+*/
+v8qi __attribute__((arm_streaming_compatible))
+fpr_to_fpr (v8qi q0, v8qi q1)
+{
+ return q1;
+}
+
+/*
+** gpr_to_fpr:
+** fmov d0, x0
+** ret
+*/
+v8qi __attribute__((arm_streaming_compatible))
+gpr_to_fpr ()
+{
+ register v8qi x0 asm ("x0");
+ asm volatile ("" : "=r" (x0));
+ return x0;
+}
+
+/*
+** zero_to_fpr:
+** fmov d0, xzr
+** ret
+*/
+v8qi __attribute__((arm_streaming_compatible))
+zero_to_fpr ()
+{
+ return (v8qi) {};
+}
+
+/*
+** fpr_to_gpr:
+** umov x0, v0\.d\[0\]
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+fpr_to_gpr (v8qi q0)
+{
+ register v8qi x0 asm ("x0");
+ x0 = q0;
+ asm volatile ("" :: "r" (x0));
+}
new file mode 100644
@@ -0,0 +1,13 @@
+// { dg-options "" }
+
+#include <arm_neon.h>
+
+#pragma GCC target "+nosme"
+
+// { dg-error {inlining failed.*'vaddq_s32'} "" { target *-*-* } 0 }
+
+int32x4_t __attribute__((arm_streaming_compatible))
+foo (int32x4_t x, int32x4_t y)
+{
+ return vaddq_s32 (x, y);
+}
new file mode 100644
@@ -0,0 +1,11 @@
+// { dg-options "" }
+
+#include <arm_neon.h>
+
+// { dg-error {inlining failed.*'vaddq_s32'} "" { target *-*-* } 0 }
+
+int32x4_t __attribute__((arm_streaming_compatible))
+foo (int32x4_t x, int32x4_t y)
+{
+ return vaddq_s32 (x, y);
+}
new file mode 100644
@@ -0,0 +1,11 @@
+// { dg-options "" }
+
+#include <arm_neon.h>
+
+// { dg-error {inlining failed.*'vaddq_s32'} "" { target *-*-* } 0 }
+
+int32x4_t __attribute__((arm_streaming))
+foo (int32x4_t x, int32x4_t y)
+{
+ return vaddq_s32 (x, y);
+}