[14/16] aarch64: Add support for arm_locally_streaming

Message ID mptzgcvmasc.fsf@arm.com
State New
Headers
Series aarch64: Add support for SME |

Commit Message

Richard Sandiford Nov. 13, 2022, 10:03 a.m. UTC
  This patch adds support for the arm_locally_streaming attribute,
which allows a function to use SME internally without changing
the function's ABI.  The attribute is valid but redundant for
arm_streaming functions.

gcc/
	* config/aarch64/aarch64.cc (aarch64_attribute_table): Add
	arm_locally_streaming.
	(aarch64_fndecl_is_locally_streaming): New function.
	(aarch64_fndecl_sm_state): Handle arm_locally_streaming functions.
	(aarch64_cfun_enables_pstate_sm): New function.
	(aarch64_add_offset): Add an argument that specifies whether
	the streaming vector length should be used instead of the
	prevailing one.
	(aarch64_split_add_offset, aarch64_add_sp, aarch64_sub_sp): Likewise.
	(aarch64_allocate_and_probe_stack_space): Likewise.
	(aarch64_expand_mov_immediate): Update calls accordingly.
	(aarch64_need_old_pstate_sm): Return true for locally-streaming
	streaming-compatible functions.
	(aarch64_layout_frame): Force all call-preserved Z and P registers
	to be saved and restored if the function switches PSTATE.SM in the
	prologue.
	(aarch64_get_separate_components): Disable shrink-wrapping of
	such Z and P saves and restores.
	(aarch64_use_late_prologue_epilogue): New function.
	(aarch64_expand_prologue): Measure SVE lengths in the streaming
	vector length for locally-streaming functions, then emit code
	to enable streaming mode.  Combine separate SMSTART ZA and
	SMSTART SM instructions into a single SMSTART where possible.
	(aarch64_expand_epilogue): Likewise in reverse.
	(TARGET_USE_LATE_PROLOGUE_EPILOGUE): Define.
	* config/aarch64/aarch64-sme.md (UNSPEC_SMSTART): New unspec.
	(UNSPEC_SMSTOP): Likewise.
	(aarch64_smstart, aarch64_smstop): New patterns.

gcc/testsuite/
	* gcc.target/aarch64/sme/locally_streaming_1.c: New test.
	* gcc.target/aarch64/sme/locally_streaming_2.c: Likewise.
	* gcc.target/aarch64/sme/locally_streaming_3.c: Likewise.
---
 gcc/config/aarch64/aarch64-sme.md             |  82 ++++
 gcc/config/aarch64/aarch64.cc                 | 237 ++++++++--
 .../aarch64/sme/locally_streaming_1.c         | 433 ++++++++++++++++++
 .../aarch64/sme/locally_streaming_2.c         | 177 +++++++
 .../aarch64/sme/locally_streaming_3.c         | 273 +++++++++++
 5 files changed, 1164 insertions(+), 38 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
  

Patch

diff --git a/gcc/config/aarch64/aarch64-sme.md b/gcc/config/aarch64/aarch64-sme.md
index 7b3ccea2e11..70be7adba28 100644
--- a/gcc/config/aarch64/aarch64-sme.md
+++ b/gcc/config/aarch64/aarch64-sme.md
@@ -281,6 +281,88 @@  (define_insn_and_split "aarch64_restore_za"
     DONE;
   }
 )
+
+;; -------------------------------------------------------------------------
+;; ---- Combined PSTATE.SM and PSTATE.ZA management
+;; -------------------------------------------------------------------------
+;; Includes
+;; - SMSTART
+;; - SMSTOP
+;; -------------------------------------------------------------------------
+
+(define_c_enum "unspec" [
+  UNSPEC_SMSTART
+  UNSPEC_SMSTOP
+])
+
+;; Enable SM and ZA, starting with fresh ZA contents.  This is only valid when
+;; SME is present, but the pattern does not depend on TARGET_SME since it can
+;; be used conditionally.
+(define_insn "aarch64_smstart"
+  [(unspec_volatile [(const_int 0)] UNSPEC_SMSTART)
+   (clobber (reg:V4x16QI V0_REGNUM))
+   (clobber (reg:V4x16QI V4_REGNUM))
+   (clobber (reg:V4x16QI V8_REGNUM))
+   (clobber (reg:V4x16QI V12_REGNUM))
+   (clobber (reg:V4x16QI V16_REGNUM))
+   (clobber (reg:V4x16QI V20_REGNUM))
+   (clobber (reg:V4x16QI V24_REGNUM))
+   (clobber (reg:V4x16QI V28_REGNUM))
+   (clobber (reg:VNx16BI P0_REGNUM))
+   (clobber (reg:VNx16BI P1_REGNUM))
+   (clobber (reg:VNx16BI P2_REGNUM))
+   (clobber (reg:VNx16BI P3_REGNUM))
+   (clobber (reg:VNx16BI P4_REGNUM))
+   (clobber (reg:VNx16BI P5_REGNUM))
+   (clobber (reg:VNx16BI P6_REGNUM))
+   (clobber (reg:VNx16BI P7_REGNUM))
+   (clobber (reg:VNx16BI P8_REGNUM))
+   (clobber (reg:VNx16BI P9_REGNUM))
+   (clobber (reg:VNx16BI P10_REGNUM))
+   (clobber (reg:VNx16BI P11_REGNUM))
+   (clobber (reg:VNx16BI P12_REGNUM))
+   (clobber (reg:VNx16BI P13_REGNUM))
+   (clobber (reg:VNx16BI P14_REGNUM))
+   (clobber (reg:VNx16BI P15_REGNUM))
+   (clobber (reg:VNx16QI ZA_REGNUM))]
+  ""
+  "smstart"
+)
+
+;; Disable SM and ZA, and discard its current contents.  This is only valid
+;; when SME is present, but the pattern does not depend on TARGET_SME since
+;; it can be used conditionally.
+(define_insn "aarch64_smstop"
+  [(unspec_volatile [(reg:VNx16QI OLD_ZA_REGNUM)] UNSPEC_SMSTOP)
+   (clobber (reg:V4x16QI V0_REGNUM))
+   (clobber (reg:V4x16QI V4_REGNUM))
+   (clobber (reg:V4x16QI V8_REGNUM))
+   (clobber (reg:V4x16QI V12_REGNUM))
+   (clobber (reg:V4x16QI V16_REGNUM))
+   (clobber (reg:V4x16QI V20_REGNUM))
+   (clobber (reg:V4x16QI V24_REGNUM))
+   (clobber (reg:V4x16QI V28_REGNUM))
+   (clobber (reg:VNx16BI P0_REGNUM))
+   (clobber (reg:VNx16BI P1_REGNUM))
+   (clobber (reg:VNx16BI P2_REGNUM))
+   (clobber (reg:VNx16BI P3_REGNUM))
+   (clobber (reg:VNx16BI P4_REGNUM))
+   (clobber (reg:VNx16BI P5_REGNUM))
+   (clobber (reg:VNx16BI P6_REGNUM))
+   (clobber (reg:VNx16BI P7_REGNUM))
+   (clobber (reg:VNx16BI P8_REGNUM))
+   (clobber (reg:VNx16BI P9_REGNUM))
+   (clobber (reg:VNx16BI P10_REGNUM))
+   (clobber (reg:VNx16BI P11_REGNUM))
+   (clobber (reg:VNx16BI P12_REGNUM))
+   (clobber (reg:VNx16BI P13_REGNUM))
+   (clobber (reg:VNx16BI P14_REGNUM))
+   (clobber (reg:VNx16BI P15_REGNUM))
+   (clobber (reg:VNx16QI ZA_REGNUM))]
+  ""
+  "smstop"
+)
+
 ;; =========================================================================
 ;; == Loads, stores and moves
 ;; =========================================================================
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 966d13abe4c..48bf2de4b3d 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -2790,6 +2790,7 @@  static const struct attribute_spec aarch64_attribute_table[] =
 			  NULL, attr_streaming_exclusions },
   { "arm_streaming_compatible", 0, 0, false, true,  true,  true,
 			  NULL, attr_streaming_exclusions },
+  { "arm_locally_streaming", 0, 0, true, false, false, false, NULL, NULL },
   { "arm_new_za",	  0, 0, true, false, false, false,
 			  handle_arm_new_za_attribute,
 			  attr_arm_new_za_exclusions },
@@ -4162,6 +4163,15 @@  aarch64_fndecl_has_new_za_state (const_tree fndecl)
   return lookup_attribute ("arm_new_za", DECL_ATTRIBUTES (fndecl));
 }
 
+/* Return true if FNDECL uses streaming mode internally, as an
+   implementation choice.  */
+
+static bool
+aarch64_fndecl_is_locally_streaming (const_tree fndecl)
+{
+  return lookup_attribute ("arm_locally_streaming", DECL_ATTRIBUTES (fndecl));
+}
+
 /* Return the state of PSTATE.SM when compiling the body of
    function FNDECL.  This might be different from the state of
    PSTATE.SM on entry.  */
@@ -4169,6 +4179,9 @@  aarch64_fndecl_has_new_za_state (const_tree fndecl)
 static aarch64_feature_flags
 aarch64_fndecl_sm_state (const_tree fndecl)
 {
+  if (aarch64_fndecl_is_locally_streaming (fndecl))
+    return AARCH64_FL_SM_ON;
+
   return aarch64_fntype_sm_state (TREE_TYPE (fndecl));
 }
 
@@ -4222,6 +4235,16 @@  aarch64_cfun_incoming_za_state ()
   return aarch64_fntype_za_state (TREE_TYPE (cfun->decl));
 }
 
+/* Return true if PSTATE.SM is 1 in the body of the current function,
+   but is not guaranteed to be 1 on entry.  */
+
+static bool
+aarch64_cfun_enables_pstate_sm ()
+{
+  return (aarch64_fndecl_is_locally_streaming (cfun->decl)
+	  && aarch64_cfun_incoming_sm_state () != AARCH64_FL_SM_ON);
+}
+
 /* Return true if the current function creates new ZA state (as opposed
    to sharing ZA with its callers or ignoring ZA altogether).  */
 
@@ -6432,6 +6455,10 @@  aarch64_add_offset_temporaries (rtx x)
    TEMP2, if nonnull, is a second temporary register that doesn't
    overlap either DEST or REG.
 
+   FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of OFFSET
+   is measured relative to the SME vector length instead of the current
+   prevailing vector length.  It is 0 otherwise.
+
    Since this function may be used to adjust the stack pointer, we must
    ensure that it cannot cause transient stack deallocation (for example
    by first incrementing SP and then decrementing when adjusting by a
@@ -6440,6 +6467,7 @@  aarch64_add_offset_temporaries (rtx x)
 static void
 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
 		    poly_int64 offset, rtx temp1, rtx temp2,
+		    aarch64_feature_flags force_isa_mode,
 		    bool frame_related_p, bool emit_move_imm = true)
 {
   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
@@ -6452,9 +6480,17 @@  aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
   /* Try using ADDVL or ADDPL to add the whole value.  */
   if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
     {
-      rtx offset_rtx = gen_int_mode (offset, mode);
+      rtx offset_rtx;
+      if (force_isa_mode == 0)
+	offset_rtx = gen_int_mode (offset, mode);
+      else
+	offset_rtx = aarch64_sme_vq_immediate (mode, offset.coeffs[0], 0);
       rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
       RTX_FRAME_RELATED_P (insn) = frame_related_p;
+      if (frame_related_p && (force_isa_mode & AARCH64_FL_SM_ON))
+	add_reg_note (insn, REG_CFA_ADJUST_CFA,
+		      gen_rtx_SET (dest, plus_constant (Pmode, src,
+							offset)));
       return;
     }
 
@@ -6470,11 +6506,19 @@  aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
   if (src != const0_rtx
       && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
     {
-      rtx offset_rtx = gen_int_mode (poly_offset, mode);
+      rtx offset_rtx;
+      if (force_isa_mode == 0)
+	offset_rtx = gen_int_mode (poly_offset, mode);
+      else
+	offset_rtx = aarch64_sme_vq_immediate (mode, factor, 0);
       if (frame_related_p)
 	{
 	  rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
 	  RTX_FRAME_RELATED_P (insn) = true;
+	  if (force_isa_mode & AARCH64_FL_SM_ON)
+	    add_reg_note (insn, REG_CFA_ADJUST_CFA,
+			  gen_rtx_SET (dest, plus_constant (Pmode, src,
+							    poly_offset)));
 	  src = dest;
 	}
       else
@@ -6505,8 +6549,18 @@  aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
       rtx val;
       if (IN_RANGE (rel_factor, -32, 31))
 	{
+	  if (force_isa_mode & AARCH64_FL_SM_ON)
+	    {
+	      /* Try to use an unshifted RDSVL, otherwise fall back on
+		 a shifted RDSVL #1.  */
+	      if (aarch64_sve_rdvl_addvl_factor_p (factor))
+		shift = 0;
+	      else
+		factor = rel_factor * 16;
+	      val = aarch64_sme_vq_immediate (mode, factor, 0);
+	    }
 	  /* Try to use an unshifted CNT[BHWD].  */
-	  if (aarch64_sve_cnt_factor_p (factor))
+	  else if (aarch64_sve_cnt_factor_p (factor))
 	    {
 	      val = gen_int_mode (poly_int64 (factor, factor), mode);
 	      shift = 0;
@@ -6542,12 +6596,19 @@  aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
 	     a shift and add sequence for the multiplication.
 	     If CNTB << SHIFT is out of range, stick with the current
 	     shift factor.  */
-	  if (IN_RANGE (low_bit, 2, 16 * 16))
+	  if (force_isa_mode == 0
+	      && IN_RANGE (low_bit, 2, 16 * 16))
 	    {
 	      val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
 	      shift = 0;
 	    }
-	  else
+	  else if ((force_isa_mode & AARCH64_FL_SM_ON)
+		   && aarch64_sve_rdvl_addvl_factor_p (low_bit))
+	    {
+	      val = aarch64_sme_vq_immediate (mode, low_bit, 0);
+	      shift = 0;
+	    }
+ 	  else
 	    val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
 
 	  val = aarch64_force_temporary (mode, temp1, val);
@@ -6634,30 +6695,34 @@  aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
 			  rtx offset_rtx, rtx temp1, rtx temp2)
 {
   aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
-		      temp1, temp2, false);
+		      temp1, temp2, 0, false);
 }
 
 /* Add DELTA to the stack pointer, marking the instructions frame-related.
-   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
-   if TEMP1 already contains abs (DELTA).  */
+   TEMP1 is available as a temporary if nonnull.  FORCE_ISA_MODE is as
+   for aarch64_add_offset.  EMIT_MOVE_IMM is false if TEMP1 already
+   contains abs (DELTA).  */
 
 static inline void
-aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
+aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta,
+		aarch64_feature_flags force_isa_mode, bool emit_move_imm)
 {
   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
-		      temp1, temp2, true, emit_move_imm);
+		      temp1, temp2, force_isa_mode, true, emit_move_imm);
 }
 
 /* Subtract DELTA from the stack pointer, marking the instructions
-   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
-   if nonnull.  */
+   frame-related if FRAME_RELATED_P.  FORCE_ISA_MODE is as for
+   aarch64_add_offset.  TEMP1 is available as a temporary if nonnull.  */
 
 static inline void
-aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
-		bool emit_move_imm = true)
+aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta,
+		aarch64_feature_flags force_isa_mode,
+		bool frame_related_p, bool emit_move_imm = true)
 {
   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
-		      temp1, temp2, frame_related_p, emit_move_imm);
+		      temp1, temp2, force_isa_mode, frame_related_p,
+		      emit_move_imm);
 }
 
 /* A streaming-compatible function needs to switch temporarily to the known
@@ -7673,11 +7738,11 @@  aarch64_expand_mov_immediate (rtx dest, rtx imm)
 		{
 		  base = aarch64_force_temporary (int_mode, dest, base);
 		  aarch64_add_offset (int_mode, dest, base, offset,
-				      NULL_RTX, NULL_RTX, false);
+				      NULL_RTX, NULL_RTX, 0, false);
 		}
 	      else
 		aarch64_add_offset (int_mode, dest, base, offset,
-				    dest, NULL_RTX, false);
+				    dest, NULL_RTX, 0, false);
 	    }
 	  return;
 	}
@@ -7704,7 +7769,7 @@  aarch64_expand_mov_immediate (rtx dest, rtx imm)
 	      gcc_assert (can_create_pseudo_p ());
 	      base = aarch64_force_temporary (int_mode, dest, base);
 	      aarch64_add_offset (int_mode, dest, base, const_offset,
-				  NULL_RTX, NULL_RTX, false);
+				  NULL_RTX, NULL_RTX, 0, false);
 	      return;
 	    }
 
@@ -7744,7 +7809,7 @@  aarch64_expand_mov_immediate (rtx dest, rtx imm)
 	      gcc_assert(can_create_pseudo_p ());
 	      base = aarch64_force_temporary (int_mode, dest, base);
 	      aarch64_add_offset (int_mode, dest, base, const_offset,
-				  NULL_RTX, NULL_RTX, false);
+				  NULL_RTX, NULL_RTX, 0, false);
 	      return;
 	    }
 	  /* FALLTHRU */
@@ -9212,6 +9277,9 @@  aarch64_need_old_pstate_sm ()
   if (aarch64_cfun_incoming_sm_state () != 0)
     return false;
 
+  if (aarch64_cfun_enables_pstate_sm ())
+    return true;
+
   if (cfun->machine->call_switches_sm_state)
     for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
       if (auto *call = dyn_cast<rtx_call_insn *> (insn))
@@ -9238,6 +9306,7 @@  aarch64_layout_frame (void)
   poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
   bool frame_related_fp_reg_p = false;
   aarch64_frame &frame = cfun->machine->frame;
+  bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
 
   frame.emit_frame_chain = aarch64_needs_frame_chain ();
 
@@ -9277,7 +9346,7 @@  aarch64_layout_frame (void)
       frame.reg_offset[regno] = SLOT_REQUIRED;
 
   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
-    if (df_regs_ever_live_p (regno)
+    if ((enables_pstate_sm || df_regs_ever_live_p (regno))
 	&& !fixed_regs[regno]
 	&& !crtl->abi->clobbers_full_reg_p (regno))
       {
@@ -9306,7 +9375,7 @@  aarch64_layout_frame (void)
     }
 
   for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
-    if (df_regs_ever_live_p (regno)
+    if ((enables_pstate_sm || df_regs_ever_live_p (regno))
 	&& !fixed_regs[regno]
 	&& !crtl->abi->clobbers_full_reg_p (regno))
       frame.reg_offset[regno] = SLOT_REQUIRED;
@@ -10121,9 +10190,14 @@  aarch64_get_separate_components (void)
   bitmap_clear (components);
 
   /* The registers we need saved to the frame.  */
+  bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
     if (aarch64_register_saved_on_entry (regno))
       {
+	if (enables_pstate_sm
+	    && (FP_REGNUM_P (regno) || PR_REGNUM_P (regno)))
+	  continue;
+
 	/* Punt on saves and restores that use ST1D and LD1D.  We could
 	   try to be smarter, but it would involve making sure that the
 	   spare predicate register itself is safe to use at the save
@@ -10438,6 +10512,7 @@  aarch64_stack_clash_protection_alloca_probe_range (void)
 static void
 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 					poly_int64 poly_size,
+					aarch64_feature_flags force_isa_mode,
 					bool frame_related_p,
 					bool final_adjustment_p)
 {
@@ -10498,7 +10573,8 @@  aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
   if (known_lt (poly_size, min_probe_threshold)
       || !flag_stack_clash_protection)
     {
-      aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
+      aarch64_sub_sp (temp1, temp2, poly_size, force_isa_mode,
+		      frame_related_p);
       return;
     }
 
@@ -10515,7 +10591,8 @@  aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 
       /* First calculate the amount of bytes we're actually spilling.  */
       aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
-			  poly_size, temp1, temp2, false, true);
+			  poly_size, temp1, temp2, force_isa_mode,
+			  false, true);
 
       rtx_insn *insn = get_last_insn ();
 
@@ -10573,7 +10650,7 @@  aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
     {
       for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
 	{
-	  aarch64_sub_sp (NULL, temp2, guard_size, true);
+	  aarch64_sub_sp (NULL, temp2, guard_size, force_isa_mode, true);
 	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
 					   guard_used_by_caller));
 	  emit_insn (gen_blockage ());
@@ -10584,7 +10661,7 @@  aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
     {
       /* Compute the ending address.  */
       aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
-			  temp1, NULL, false, true);
+			  temp1, NULL, force_isa_mode, false, true);
       rtx_insn *insn = get_last_insn ();
 
       /* For the initial allocation, we don't have a frame pointer
@@ -10654,7 +10731,7 @@  aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
       else if (final_adjustment_p && rounded_size == 0)
 	residual_probe_offset = 0;
 
-      aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
+      aarch64_sub_sp (temp1, temp2, residual, force_isa_mode, frame_related_p);
       if (residual >= min_probe_threshold)
 	{
 	  if (dump_file)
@@ -10670,6 +10747,14 @@  aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
     }
 }
 
+/* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE.  */
+
+static bool
+aarch64_use_late_prologue_epilogue ()
+{
+  return aarch64_cfun_enables_pstate_sm ();
+}
+
 /* Return 1 if the register is used by the epilogue.  We need to say the
    return register is used, but only after epilogue generation is complete.
    Note that in the case of sibcalls, the values "used by the epilogue" are
@@ -10826,6 +10911,9 @@  aarch64_expand_prologue (void)
   unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
   bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
   rtx_insn *insn;
+  aarch64_feature_flags force_isa_mode = 0;
+  if (aarch64_cfun_enables_pstate_sm ())
+    force_isa_mode = AARCH64_FL_SM_ON;
 
   if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
     {
@@ -10887,7 +10975,7 @@  aarch64_expand_prologue (void)
      less the amount of the guard reserved for use by the caller's
      outgoing args.  */
   aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
-					  true, false);
+					  force_isa_mode, true, false);
 
   if (callee_adjust != 0)
     aarch64_push_regs (reg1, reg2, callee_adjust);
@@ -10913,7 +11001,8 @@  aarch64_expand_prologue (void)
 	gcc_assert (known_eq (chain_offset, 0));
       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
 			  stack_pointer_rtx, chain_offset,
-			  tmp1_rtx, tmp0_rtx, frame_pointer_needed);
+			  tmp1_rtx, tmp0_rtx, force_isa_mode,
+			  frame_pointer_needed);
       if (frame_pointer_needed && !frame_size.is_constant ())
 	{
 	  /* Variable-sized frames need to describe the save slot
@@ -10956,6 +11045,7 @@  aarch64_expand_prologue (void)
 		  || known_eq (initial_adjust, 0));
       aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
 					      sve_callee_adjust,
+					      force_isa_mode,
 					      !frame_pointer_needed, false);
       saved_regs_offset += sve_callee_adjust;
     }
@@ -10968,10 +11058,13 @@  aarch64_expand_prologue (void)
   /* We may need to probe the final adjustment if it is larger than the guard
      that is assumed by the called.  */
   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
+					  force_isa_mode,
 					  !frame_pointer_needed, true);
 
-  /* Save the incoming value of PSTATE.SM, if required.  */
-  if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
+  /* Save the incoming value of PSTATE.SM, if required.  Code further
+     down does this for locally-streaming functions.  */
+  if (known_ge (cfun->machine->frame.old_svcr_offset, 0)
+      && !aarch64_cfun_enables_pstate_sm ())
     {
       rtx mem = aarch64_old_svcr_mem ();
       MEM_VOLATILE_P (mem) = 1;
@@ -11022,7 +11115,40 @@  aarch64_expand_prologue (void)
       emit_insn (gen_aarch64_tpidr2_save ());
       emit_insn (gen_aarch64_clear_tpidr2 ());
       emit_label (label);
-      emit_insn (gen_aarch64_smstart_za ());
+      if (!aarch64_cfun_enables_pstate_sm ()
+	  || known_ge (cfun->machine->frame.old_svcr_offset, 0))
+	emit_insn (gen_aarch64_smstart_za ());
+    }
+
+  /* Enable PSTATE.SM, if required.  */
+  if (aarch64_cfun_enables_pstate_sm ())
+    {
+      rtx_insn *guard_label = nullptr;
+      if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
+	{
+	  /* The current function is streaming-compatible.  Save the
+	     original state of PSTATE.SM.  */
+	  rtx svcr = gen_rtx_REG (DImode, IP0_REGNUM);
+	  emit_insn (gen_aarch64_read_svcr (svcr));
+	  emit_move_insn (aarch64_old_svcr_mem (), svcr);
+	  guard_label = aarch64_guard_switch_pstate_sm (svcr,
+							aarch64_isa_flags);
+	}
+      aarch64_sme_mode_switch_regs args_switch;
+      auto &args = crtl->args.info;
+      for (unsigned int i = 0; i < args.num_sme_mode_switch_args; ++i)
+	{
+	  rtx x = args.sme_mode_switch_args[i];
+	  args_switch.add_reg (GET_MODE (x), REGNO (x));
+	}
+      args_switch.emit_prologue ();
+      if (cfun->machine->frame.has_new_za_state && !guard_label)
+	emit_insn (gen_aarch64_smstart ());
+      else
+	emit_insn (gen_aarch64_smstart_sm ());
+      args_switch.emit_epilogue ();
+      if (guard_label)
+	emit_label (guard_label);
     }
 }
 
@@ -11073,6 +11199,9 @@  aarch64_expand_epilogue (rtx_call_insn *sibcall)
   HOST_WIDE_INT guard_size
     = 1 << param_stack_clash_protection_guard_size;
   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
+  aarch64_feature_flags force_isa_mode = 0;
+  if (aarch64_cfun_enables_pstate_sm ())
+    force_isa_mode = AARCH64_FL_SM_ON;
 
   /* We can re-use the registers when:
 
@@ -11097,7 +11226,33 @@  aarch64_expand_epilogue (rtx_call_insn *sibcall)
     = maybe_ne (get_frame_size ()
 		+ cfun->machine->frame.saved_varargs_size, 0);
 
-  if (cfun->machine->frame.has_new_za_state)
+  /* Reset PSTATE.SM, if required.  Fold an unconditional SMSTOP SM
+     and SMSTOP ZA into a single SMSTOP.  */
+  bool pending_smstop_za = cfun->machine->frame.has_new_za_state;
+  if (aarch64_cfun_enables_pstate_sm ())
+    {
+      rtx_insn *guard_label = nullptr;
+      if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
+	guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
+						      aarch64_isa_flags);
+      aarch64_sme_mode_switch_regs args_switch;
+      if (crtl->return_rtx && REG_P (crtl->return_rtx))
+	args_switch.add_reg (GET_MODE (crtl->return_rtx),
+			     REGNO (crtl->return_rtx));
+      args_switch.emit_prologue ();
+      if (pending_smstop_za && !guard_label)
+	{
+	  emit_insn (gen_aarch64_smstop ());
+	  pending_smstop_za = false;
+	}
+      else
+	emit_insn (gen_aarch64_smstop_sm ());
+      args_switch.emit_epilogue ();
+      if (guard_label)
+	emit_label (guard_label);
+    }
+
+  if (pending_smstop_za)
     /* Turn ZA off before returning.  TPIDR2_EL0 is already null at
        this point.  */
     emit_insn (gen_aarch64_smstop_za ());
@@ -11122,12 +11277,13 @@  aarch64_expand_epilogue (rtx_call_insn *sibcall)
     aarch64_add_offset (Pmode, stack_pointer_rtx,
 			hard_frame_pointer_rtx,
 			-callee_offset - below_hard_fp_saved_regs_size,
-			tmp1_rtx, tmp0_rtx, callee_adjust == 0);
+			tmp1_rtx, tmp0_rtx, force_isa_mode,
+			callee_adjust == 0);
   else
      /* The case where we need to re-use the register here is very rare, so
 	avoid the complicated condition and just always emit a move if the
 	immediate doesn't fit.  */
-     aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
+     aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, force_isa_mode, true);
 
   /* Restore the vector registers before the predicate registers,
      so that we can use P4 as a temporary for big-endian SVE frames.  */
@@ -11136,7 +11292,8 @@  aarch64_expand_epilogue (rtx_call_insn *sibcall)
   aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
 				false, &cfi_ops);
   if (maybe_ne (sve_callee_adjust, 0))
-    aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
+    aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust,
+		    force_isa_mode, true);
 
   /* When shadow call stack is enabled, the scs_pop in the epilogue will
      restore x30, we don't need to restore x30 again in the traditional
@@ -11167,7 +11324,7 @@  aarch64_expand_epilogue (rtx_call_insn *sibcall)
 
   /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
      add restriction on emit_move optimization to leaf functions.  */
-  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
+  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, force_isa_mode,
 		  (!can_inherit_p || !crtl->is_leaf
 		   || df_regs_ever_live_p (EP0_REGNUM)));
 
@@ -11300,7 +11457,8 @@  aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
   temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
 
   if (vcall_offset == 0)
-    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
+    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0,
+			0, false);
   else
     {
       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
@@ -11313,7 +11471,7 @@  aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
 				       plus_constant (Pmode, this_rtx, delta));
 	  else
 	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
-				temp1, temp0, false);
+				temp1, temp0, 0, false);
 	}
 
       if (Pmode == ptr_mode)
@@ -29469,6 +29627,9 @@  aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_HAVE_SHADOW_CALL_STACK
 #define TARGET_HAVE_SHADOW_CALL_STACK true
 
+#undef TARGET_USE_LATE_PROLOGUE_EPILOGUE
+#define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue
+
 #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
 #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
new file mode 100644
index 00000000000..ab9c8cd6bac
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
@@ -0,0 +1,433 @@ 
+// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" }
+// { dg-final { check-function-bodies "**" "" } }
+
+__attribute__((arm_streaming, arm_shared_za)) void consume_za ();
+
+/*
+** n_ls:
+**	stp	d8, d9, \[sp, #?-64\]!
+**	stp	d10, d11, \[sp, #?16\]
+**	stp	d12, d13, \[sp, #?32\]
+**	stp	d14, d15, \[sp, #?48\]
+**	smstart	sm
+**	smstop	sm
+**	ldp	d10, d11, \[sp, #?16\]
+**	ldp	d12, d13, \[sp, #?32\]
+**	ldp	d14, d15, \[sp, #?48\]
+**	ldp	d8, d9, \[sp\], #?64
+**	ret
+*/
+void __attribute__((arm_locally_streaming))
+n_ls ()
+{
+  asm ("");
+}
+
+/*
+** s_ls:
+**	ret
+*/
+void __attribute__((arm_streaming, arm_locally_streaming))
+s_ls ()
+{
+  asm ("");
+}
+
+/*
+** sc_ls:
+**	stp	x29, x30, \[sp, #?-96\]!
+**	mov	x29, sp
+**	stp	d8, d9, \[sp, #?32\]
+**	stp	d10, d11, \[sp, #?48\]
+**	stp	d12, d13, \[sp, #?64\]
+**	stp	d14, d15, \[sp, #?80\]
+**	mrs	x16, svcr
+**	str	x16, \[x29, #?16\]
+**	tbnz	x16, 0, .*
+**	smstart	sm
+**	ldr	x16, \[x29, #?16\]
+**	tbnz	x16, 0, .*
+**	smstop	sm
+**	ldp	d8, d9, \[sp, #?32\]
+**	ldp	d10, d11, \[sp, #?48\]
+**	ldp	d12, d13, \[sp, #?64\]
+**	ldp	d14, d15, \[sp, #?80\]
+**	ldp	x29, x30, \[sp\], #?96
+**	ret
+*/
+void __attribute__((arm_streaming_compatible, arm_locally_streaming))
+sc_ls ()
+{
+  asm ("");
+}
+
+/*
+** n_ls_new_za:
+**	str	x30, \[sp, #?-80\]!
+**	stp	d8, d9, \[sp, #?16\]
+**	stp	d10, d11, \[sp, #?32\]
+**	stp	d12, d13, \[sp, #?48\]
+**	stp	d14, d15, \[sp, #?64\]
+**	mrs	x11, tpidr2_el0
+**	cbz	x11, .*
+**	bl	__arm_tpidr2_save
+**	msr	tpidr2_el0, xzr
+**	smstart
+**	bl	consume_za
+**	smstop
+**	ldp	d8, d9, \[sp, #?16\]
+**	ldp	d10, d11, \[sp, #?32\]
+**	ldp	d12, d13, \[sp, #?48\]
+**	ldp	d14, d15, \[sp, #?64\]
+**	ldr	x30, \[sp\], #?80
+**	ret
+*/
+void __attribute__((arm_locally_streaming, arm_new_za))
+n_ls_new_za ()
+{
+  consume_za ();
+}
+
+/*
+** s_ls_new_za:
+**	str	x30, \[sp, #?-16\]!
+**	mrs	x11, tpidr2_el0
+**	cbz	x11, .*
+**	bl	__arm_tpidr2_save
+**	msr	tpidr2_el0, xzr
+**	smstart	za
+**	bl	consume_za
+**	smstop	za
+**	ldr	x30, \[sp\], #?16
+**	ret
+*/
+void __attribute__((arm_locally_streaming, arm_streaming, arm_new_za))
+s_ls_new_za ()
+{
+  consume_za ();
+}
+
+/*
+** sc_ls_new_za:
+**	stp	x29, x30, \[sp, #?-96\]!
+**	mov	x29, sp
+**	stp	d8, d9, \[sp, #?32\]
+**	stp	d10, d11, \[sp, #?48\]
+**	stp	d12, d13, \[sp, #?64\]
+**	stp	d14, d15, \[sp, #?80\]
+**	mrs	x11, tpidr2_el0
+**	cbz	x11, .*
+**	bl	__arm_tpidr2_save
+**	msr	tpidr2_el0, xzr
+**	smstart	za
+**	mrs	x16, svcr
+**	str	x16, \[x29, #?16\]
+**	tbnz	x16, 0, .*
+**	smstart	sm
+**	bl	consume_za
+**	ldr	x16, \[x29, #?16\]
+**	tbnz	x16, 0, .*
+**	smstop	sm
+**	smstop	za
+**	ldp	d8, d9, \[sp, #?32\]
+**	ldp	d10, d11, \[sp, #?48\]
+**	ldp	d12, d13, \[sp, #?64\]
+**	ldp	d14, d15, \[sp, #?80\]
+**	ldp	x29, x30, \[sp\], #?96
+**	ret
+*/
+void __attribute__((arm_streaming_compatible, arm_locally_streaming, arm_new_za))
+sc_ls_new_za ()
+{
+  consume_za ();
+}
+
+/*
+** n_ls_shared_za:
+**	str	x30, \[sp, #?-80\]!
+**	stp	d8, d9, \[sp, #?16\]
+**	stp	d10, d11, \[sp, #?32\]
+**	stp	d12, d13, \[sp, #?48\]
+**	stp	d14, d15, \[sp, #?64\]
+**	smstart	sm
+**	bl	consume_za
+**	smstop	sm
+**	ldp	d8, d9, \[sp, #?16\]
+**	ldp	d10, d11, \[sp, #?32\]
+**	ldp	d12, d13, \[sp, #?48\]
+**	ldp	d14, d15, \[sp, #?64\]
+**	ldr	x30, \[sp\], #?80
+**	ret
+*/
+void __attribute__((arm_locally_streaming, arm_shared_za))
+n_ls_shared_za ()
+{
+  consume_za ();
+}
+
+/*
+** s_ls_shared_za:
+**	str	x30, \[sp, #?-16\]!
+**	bl	consume_za
+**	ldr	x30, \[sp\], #?16
+**	ret
+*/
+void __attribute__((arm_streaming, arm_locally_streaming, arm_shared_za))
+s_ls_shared_za ()
+{
+  consume_za ();
+}
+
+/*
+** sc_ls_shared_za:
+**	stp	x29, x30, \[sp, #?-96\]!
+**	mov	x29, sp
+**	stp	d8, d9, \[sp, #?32\]
+**	stp	d10, d11, \[sp, #?48\]
+**	stp	d12, d13, \[sp, #?64\]
+**	stp	d14, d15, \[sp, #?80\]
+**	mrs	x16, svcr
+**	str	x16, \[x29, #?16\]
+**	tbnz	x16, 0, .*
+**	smstart	sm
+**	bl	consume_za
+**	ldr	x16, \[x29, #?16\]
+**	tbnz	x16, 0, .*
+**	smstop	sm
+**	ldp	d8, d9, \[sp, #?32\]
+**	ldp	d10, d11, \[sp, #?48\]
+**	ldp	d12, d13, \[sp, #?64\]
+**	ldp	d14, d15, \[sp, #?80\]
+**	ldp	x29, x30, \[sp\], #?96
+**	ret
+*/
+void __attribute__((arm_streaming_compatible, arm_locally_streaming, arm_shared_za))
+sc_ls_shared_za ()
+{
+  consume_za ();
+}
+
+/*
+** n_ls_vector_pcs:
+**	stp	q8, q9, \[sp, #?-256\]!
+**	stp	q10, q11, \[sp, #?32\]
+**	stp	q12, q13, \[sp, #?64\]
+**	stp	q14, q15, \[sp, #?96\]
+**	stp	q16, q17, \[sp, #?128\]
+**	stp	q18, q19, \[sp, #?160\]
+**	stp	q20, q21, \[sp, #?192\]
+**	stp	q22, q23, \[sp, #?224\]
+**	smstart	sm
+**	smstop	sm
+**	ldp	q10, q11, \[sp, #?32\]
+**	ldp	q12, q13, \[sp, #?64\]
+**	ldp	q14, q15, \[sp, #?96\]
+**	ldp	q16, q17, \[sp, #?128\]
+**	ldp	q18, q19, \[sp, #?160\]
+**	ldp	q20, q21, \[sp, #?192\]
+**	ldp	q22, q23, \[sp, #?224\]
+**	ldp	q8, q9, \[sp\], #?256
+**	ret
+*/
+void __attribute__((arm_locally_streaming, aarch64_vector_pcs))
+n_ls_vector_pcs ()
+{
+  asm ("");
+}
+
+/*
+** n_ls_sve_pcs:
+**	addsvl	sp, sp, #-18
+**	str	p4, \[sp\]
+**	str	p5, \[sp, #1, mul vl\]
+**	str	p6, \[sp, #2, mul vl\]
+**	str	p7, \[sp, #3, mul vl\]
+**	str	p8, \[sp, #4, mul vl\]
+**	str	p9, \[sp, #5, mul vl\]
+**	str	p10, \[sp, #6, mul vl\]
+**	str	p11, \[sp, #7, mul vl\]
+**	str	p12, \[sp, #8, mul vl\]
+**	str	p13, \[sp, #9, mul vl\]
+**	str	p14, \[sp, #10, mul vl\]
+**	str	p15, \[sp, #11, mul vl\]
+**	str	z8, \[sp, #2, mul vl\]
+**	str	z9, \[sp, #3, mul vl\]
+**	str	z10, \[sp, #4, mul vl\]
+**	str	z11, \[sp, #5, mul vl\]
+**	str	z12, \[sp, #6, mul vl\]
+**	str	z13, \[sp, #7, mul vl\]
+**	str	z14, \[sp, #8, mul vl\]
+**	str	z15, \[sp, #9, mul vl\]
+**	str	z16, \[sp, #10, mul vl\]
+**	str	z17, \[sp, #11, mul vl\]
+**	str	z18, \[sp, #12, mul vl\]
+**	str	z19, \[sp, #13, mul vl\]
+**	str	z20, \[sp, #14, mul vl\]
+**	str	z21, \[sp, #15, mul vl\]
+**	str	z22, \[sp, #16, mul vl\]
+**	str	z23, \[sp, #17, mul vl\]
+**	addvl	sp, sp, #-1
+**	str	p0, \[sp\]
+**	smstart	sm
+**	ldr	p0, \[sp\]
+**	addvl	sp, sp, #1
+**	smstop	sm
+**	ldr	z8, \[sp, #2, mul vl\]
+**	ldr	z9, \[sp, #3, mul vl\]
+**	ldr	z10, \[sp, #4, mul vl\]
+**	ldr	z11, \[sp, #5, mul vl\]
+**	ldr	z12, \[sp, #6, mul vl\]
+**	ldr	z13, \[sp, #7, mul vl\]
+**	ldr	z14, \[sp, #8, mul vl\]
+**	ldr	z15, \[sp, #9, mul vl\]
+**	ldr	z16, \[sp, #10, mul vl\]
+**	ldr	z17, \[sp, #11, mul vl\]
+**	ldr	z18, \[sp, #12, mul vl\]
+**	ldr	z19, \[sp, #13, mul vl\]
+**	ldr	z20, \[sp, #14, mul vl\]
+**	ldr	z21, \[sp, #15, mul vl\]
+**	ldr	z22, \[sp, #16, mul vl\]
+**	ldr	z23, \[sp, #17, mul vl\]
+**	ldr	p4, \[sp\]
+**	ldr	p5, \[sp, #1, mul vl\]
+**	ldr	p6, \[sp, #2, mul vl\]
+**	ldr	p7, \[sp, #3, mul vl\]
+**	ldr	p8, \[sp, #4, mul vl\]
+**	ldr	p9, \[sp, #5, mul vl\]
+**	ldr	p10, \[sp, #6, mul vl\]
+**	ldr	p11, \[sp, #7, mul vl\]
+**	ldr	p12, \[sp, #8, mul vl\]
+**	ldr	p13, \[sp, #9, mul vl\]
+**	ldr	p14, \[sp, #10, mul vl\]
+**	ldr	p15, \[sp, #11, mul vl\]
+**	addsvl	sp, sp, #18
+**	ret
+*/
+void __attribute__((arm_locally_streaming))
+n_ls_sve_pcs (__SVBool_t x)
+{
+  asm ("");
+}
+
+/*
+** n_ls_v0:
+**	addsvl	sp, sp, #-1
+**	...
+**	smstart	sm
+**	add	x[0-9]+, .*
+**	smstop	sm
+**	...
+**	addsvl	sp, sp, #1
+**	...
+*/
+#define TEST(VN) __SVInt32_t VN; asm ("" :: "r" (&VN));
+void __attribute__((arm_locally_streaming))
+n_ls_v0 ()
+{
+  TEST (v0);
+}
+
+/*
+** n_ls_v32:
+**	addsvl	sp, sp, #-32
+**	...
+**	smstart	sm
+**	...
+**	smstop	sm
+**	...
+**	rdsvl	(x[0-9]+), #1
+**	lsl	(x[0-9]+), \1, #?5
+**	add	sp, sp, \2
+**	...
+*/
+void __attribute__((arm_locally_streaming))
+n_ls_v32 ()
+{
+  TEST (v0);
+  TEST (v1);
+  TEST (v2);
+  TEST (v3);
+  TEST (v4);
+  TEST (v5);
+  TEST (v6);
+  TEST (v7);
+  TEST (v8);
+  TEST (v9);
+  TEST (v10);
+  TEST (v11);
+  TEST (v12);
+  TEST (v13);
+  TEST (v14);
+  TEST (v15);
+  TEST (v16);
+  TEST (v17);
+  TEST (v18);
+  TEST (v19);
+  TEST (v20);
+  TEST (v21);
+  TEST (v22);
+  TEST (v23);
+  TEST (v24);
+  TEST (v25);
+  TEST (v26);
+  TEST (v27);
+  TEST (v28);
+  TEST (v29);
+  TEST (v30);
+  TEST (v31);
+}
+
+/*
+** n_ls_v33:
+**	rdsvl	(x[0-9]+), #1
+**	mov	(x[0-9]+), #?33
+**	mul	(x[0-9]+), (?:\1, \2|\2, \1)
+**	sub	sp, sp, \3
+**	...
+**	smstart	sm
+**	...
+**	smstop	sm
+**	...
+**	rdsvl	(x[0-9]+), #1
+**	mov	(x[0-9]+), #?33
+**	mul	(x[0-9]+), (?:\4, \5|\5, \4)
+**	add	sp, sp, \6
+**	...
+*/
+void __attribute__((arm_locally_streaming))
+n_ls_v33 ()
+{
+  TEST (v0);
+  TEST (v1);
+  TEST (v2);
+  TEST (v3);
+  TEST (v4);
+  TEST (v5);
+  TEST (v6);
+  TEST (v7);
+  TEST (v8);
+  TEST (v9);
+  TEST (v10);
+  TEST (v11);
+  TEST (v12);
+  TEST (v13);
+  TEST (v14);
+  TEST (v15);
+  TEST (v16);
+  TEST (v17);
+  TEST (v18);
+  TEST (v19);
+  TEST (v20);
+  TEST (v21);
+  TEST (v22);
+  TEST (v23);
+  TEST (v24);
+  TEST (v25);
+  TEST (v26);
+  TEST (v27);
+  TEST (v28);
+  TEST (v29);
+  TEST (v30);
+  TEST (v31);
+  TEST (v32);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
new file mode 100644
index 00000000000..4c9caf5d078
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
@@ -0,0 +1,177 @@ 
+// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+
+/*
+** test_d0:
+**	...
+**	smstart	sm
+**	.*
+**	fmov	x10, d0
+**	smstop	sm
+**	fmov	d0, x10
+**	...
+*/
+double __attribute__((arm_locally_streaming))
+test_d0 ()
+{
+  asm ("");
+  return 1.0f;
+}
+
+/*
+** test_d0_vec:
+**	...
+**	smstart	sm
+**	.*
+** (
+**	fmov	x10, d0
+** |
+**	umov	x10, v0.d\[0\]
+** )
+**	smstop	sm
+**	fmov	d0, x10
+**	...
+*/
+int8x8_t __attribute__((arm_locally_streaming))
+test_d0_vec ()
+{
+  asm volatile ("");
+  return (int8x8_t) {};
+}
+
+/*
+** test_q0:
+**	...
+**	smstart	sm
+**	.*
+**	str	q0, \[sp, #?-16\]!
+**	smstop	sm
+**	ldr	q0, \[sp\], #?16
+**	...
+*/
+int8x16_t __attribute__((arm_locally_streaming))
+test_q0 ()
+{
+  asm volatile ("");
+  return (int8x16_t) {};
+}
+
+/*
+** test_q1:
+**	...
+**	smstart	sm
+**	...
+**	stp	q0, q1, \[sp, #?-32\]!
+**	smstop	sm
+**	ldp	q0, q1, \[sp\], #?32
+**	...
+*/
+int8x16x2_t __attribute__((arm_locally_streaming))
+test_q1 ()
+{
+  asm volatile ("");
+  return (int8x16x2_t) {};
+}
+
+/*
+** test_q2:
+**	...
+**	smstart	sm
+**	...
+**	stp	q0, q1, \[sp, #?-48\]!
+**	str	q2, \[sp, #?32\]
+**	smstop	sm
+**	ldr	q2, \[sp, #?32\]
+**	ldp	q0, q1, \[sp\], #?48
+**	...
+*/
+int8x16x3_t __attribute__((arm_locally_streaming))
+test_q2 ()
+{
+  asm volatile ("");
+  return (int8x16x3_t) {};
+}
+
+/*
+** test_q3:
+**	...
+**	smstart	sm
+**	...
+**	stp	q0, q1, \[sp, #?-64\]!
+**	stp	q2, q3, \[sp, #?32\]
+**	smstop	sm
+**	ldp	q2, q3, \[sp, #?32\]
+**	ldp	q0, q1, \[sp\], #?64
+**	...
+*/
+int8x16x4_t __attribute__((arm_locally_streaming))
+test_q3 ()
+{
+  asm volatile ("");
+  return (int8x16x4_t) {};
+}
+
+/*
+** test_z0:
+**	...
+**	smstart	sm
+**	mov	z0\.b, #0
+**	addvl	sp, sp, #-1
+**	str	z0, \[sp\]
+**	smstop	sm
+**	ldr	z0, \[sp\]
+**	addvl	sp, sp, #1
+**	...
+*/
+svint8_t __attribute__((arm_locally_streaming))
+test_z0 ()
+{
+  asm volatile ("");
+  return (svint8_t) {};
+}
+
+/*
+** test_z3:
+**	...
+**	smstart	sm
+**	...
+**	addvl	sp, sp, #-4
+**	str	z0, \[sp\]
+**	str	z1, \[sp, #1, mul vl\]
+**	str	z2, \[sp, #2, mul vl\]
+**	str	z3, \[sp, #3, mul vl\]
+**	smstop	sm
+**	ldr	z0, \[sp\]
+**	ldr	z1, \[sp, #1, mul vl\]
+**	ldr	z2, \[sp, #2, mul vl\]
+**	ldr	z3, \[sp, #3, mul vl\]
+**	...
+*/
+svint8x4_t __attribute__((arm_locally_streaming))
+test_z3 ()
+{
+  asm volatile ("");
+  return (svint8x4_t) {};
+}
+
+/*
+** test_p0:
+**	...
+**	smstart	sm
+**	pfalse	p0\.b
+**	addvl	sp, sp, #-1
+**	str	p0, \[sp\]
+**	smstop	sm
+**	ldr	p0, \[sp\]
+**	addvl	sp, sp, #1
+**	...
+*/
+svbool_t __attribute__((arm_locally_streaming))
+test_p0 ()
+{
+  asm volatile ("");
+  return (svbool_t) {};
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
new file mode 100644
index 00000000000..e6cbd9d176d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
@@ -0,0 +1,273 @@ 
+// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+
+/*
+** test_d0:
+**	...
+**	fmov	x10, d0
+**	smstart	sm
+**	fmov	d0, x10
+**	smstop	sm
+**	...
+*/
+void __attribute__((arm_locally_streaming))
+test_d0 (double d0)
+{
+  asm ("");
+}
+
+/*
+** test_d7:
+**	...
+**	fmov	x10, d0
+**	fmov	x11, d1
+**	fmov	x12, d2
+**	fmov	x13, d3
+**	fmov	x14, d4
+**	fmov	x15, d5
+**	fmov	x16, d6
+**	fmov	x17, d7
+**	smstart	sm
+**	fmov	d0, x10
+**	fmov	d1, x11
+**	fmov	d2, x12
+**	fmov	d3, x13
+**	fmov	d4, x14
+**	fmov	d5, x15
+**	fmov	d6, x16
+**	fmov	d7, x17
+**	smstop	sm
+**	...
+*/
+void __attribute__((arm_locally_streaming))
+test_d7 (double d0, double d1, double d2, double d3,
+	 double d4, double d5, double d6, double d7)
+{
+  asm volatile ("");
+}
+
+/*
+** test_d0_vec:
+**	...
+** (
+**	fmov	x10, d0
+** |
+**	umov	x10, v0.d\[0\]
+** )
+**	smstart	sm
+**	fmov	d0, x10
+**	smstop	sm
+**	...
+*/
+void __attribute__((arm_locally_streaming))
+test_d0_vec (int8x8_t d0)
+{
+  asm volatile ("");
+}
+
+/*
+** test_d7_vec:
+**	...
+** (
+**	fmov	x10, d0
+**	fmov	x11, d1
+**	fmov	x12, d2
+**	fmov	x13, d3
+**	fmov	x14, d4
+**	fmov	x15, d5
+**	fmov	x16, d6
+**	fmov	x17, d7
+** |
+**	umov	x10, v0.d\[0\]
+**	umov	x11, v1.d\[0\]
+**	umov	x12, v2.d\[0\]
+**	umov	x13, v3.d\[0\]
+**	umov	x14, v4.d\[0\]
+**	umov	x15, v5.d\[0\]
+**	umov	x16, v6.d\[0\]
+**	umov	x17, v7.d\[0\]
+** )
+**	smstart	sm
+**	fmov	d0, x10
+**	fmov	d1, x11
+**	fmov	d2, x12
+**	fmov	d3, x13
+**	fmov	d4, x14
+**	fmov	d5, x15
+**	fmov	d6, x16
+**	fmov	d7, x17
+**	smstop	sm
+**	...
+*/
+void __attribute__((arm_locally_streaming))
+test_d7_vec (int8x8_t d0, int8x8_t d1, int8x8_t d2, int8x8_t d3,
+	     int8x8_t d4, int8x8_t d5, int8x8_t d6, int8x8_t d7)
+{
+  asm volatile ("");
+}
+
+/*
+** test_q0:
+**	...
+**	str	q0, \[sp, #?-16\]!
+**	smstart	sm
+**	ldr	q0, \[sp\], #?16
+**	smstop	sm
+**	...
+*/
+void __attribute__((arm_locally_streaming))
+test_q0 (int8x16_t q0)
+{
+  asm volatile ("");
+}
+
+/*
+** test_q7:
+**	...
+**	stp	q0, q1, \[sp, #?-128\]!
+**	stp	q2, q3, \[sp, #?32\]
+**	stp	q4, q5, \[sp, #?64\]
+**	stp	q6, q7, \[sp, #?96\]
+**	smstart	sm
+**	ldp	q2, q3, \[sp, #?32\]
+**	ldp	q4, q5, \[sp, #?64\]
+**	ldp	q6, q7, \[sp, #?96\]
+**	ldp	q0, q1, \[sp\], #?128
+**	smstop	sm
+**	...
+*/
+void __attribute__((arm_locally_streaming))
+test_q7 (int8x16x4_t q0, int8x16x4_t q4)
+{
+  asm volatile ("");
+}
+
+/*
+** test_z0:
+**	...
+**	addvl	sp, sp, #-1
+**	str	z0, \[sp\]
+**	smstart	sm
+**	ldr	z0, \[sp\]
+**	addvl	sp, sp, #1
+**	smstop	sm
+**	...
+*/
+void __attribute__((arm_locally_streaming))
+test_z0 (svint8_t z0)
+{
+  asm volatile ("");
+}
+
+/*
+** test_z7:
+**	...
+**	addvl	sp, sp, #-8
+**	str	z0, \[sp\]
+**	str	z1, \[sp, #1, mul vl\]
+**	str	z2, \[sp, #2, mul vl\]
+**	str	z3, \[sp, #3, mul vl\]
+**	str	z4, \[sp, #4, mul vl\]
+**	str	z5, \[sp, #5, mul vl\]
+**	str	z6, \[sp, #6, mul vl\]
+**	str	z7, \[sp, #7, mul vl\]
+**	smstart	sm
+**	ldr	z0, \[sp\]
+**	ldr	z1, \[sp, #1, mul vl\]
+**	ldr	z2, \[sp, #2, mul vl\]
+**	ldr	z3, \[sp, #3, mul vl\]
+**	ldr	z4, \[sp, #4, mul vl\]
+**	ldr	z5, \[sp, #5, mul vl\]
+**	ldr	z6, \[sp, #6, mul vl\]
+**	ldr	z7, \[sp, #7, mul vl\]
+**	addvl	sp, sp, #8
+**	smstop	sm
+**	...
+*/
+void __attribute__((arm_locally_streaming))
+test_z7 (svint8x4_t z0, svint8x4_t z4)
+{
+  asm volatile ("");
+}
+
+/*
+** test_p0:
+**	...
+**	addvl	sp, sp, #-1
+**	str	p0, \[sp\]
+**	smstart	sm
+**	ldr	p0, \[sp\]
+**	addvl	sp, sp, #1
+**	smstop	sm
+**	...
+*/
+void __attribute__((arm_locally_streaming))
+test_p0 (svbool_t p0)
+{
+  asm volatile ("");
+}
+
+/*
+** test_p3:
+**	...
+**	addvl	sp, sp, #-1
+**	str	p0, \[sp\]
+**	str	p1, \[sp, #1, mul vl\]
+**	str	p2, \[sp, #2, mul vl\]
+**	str	p3, \[sp, #3, mul vl\]
+**	smstart	sm
+**	ldr	p0, \[sp\]
+**	ldr	p1, \[sp, #1, mul vl\]
+**	ldr	p2, \[sp, #2, mul vl\]
+**	ldr	p3, \[sp, #3, mul vl\]
+**	addvl	sp, sp, #1
+**	smstop	sm
+**	...
+*/
+void __attribute__((arm_locally_streaming))
+test_p3 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3)
+{
+  asm volatile ("");
+}
+
+/*
+** test_mixed:
+**	...
+**	addvl	sp, sp, #-3
+**	str	p0, \[sp\]
+**	str	p1, \[sp, #1, mul vl\]
+**	str	p2, \[sp, #2, mul vl\]
+**	str	p3, \[sp, #3, mul vl\]
+**	str	z3, \[sp, #1, mul vl\]
+**	str	z7, \[sp, #2, mul vl\]
+**	stp	q2, q6, \[sp, #?-32\]!
+**	fmov	w10, s0
+**	fmov	x11, d1
+**	fmov	w12, s4
+**	fmov	x13, d5
+**	smstart	sm
+**	fmov	s0, w10
+**	fmov	d1, x11
+**	fmov	s4, w12
+**	fmov	d5, x13
+**	ldp	q2, q6, \[sp\], #?32
+**	ldr	p0, \[sp\]
+**	ldr	p1, \[sp, #1, mul vl\]
+**	ldr	p2, \[sp, #2, mul vl\]
+**	ldr	p3, \[sp, #3, mul vl\]
+**	ldr	z3, \[sp, #1, mul vl\]
+**	ldr	z7, \[sp, #2, mul vl\]
+**	addvl	sp, sp, #3
+**	smstop	sm
+**	...
+*/
+void __attribute__((arm_locally_streaming))
+test_mixed (float s0, double d1, float32x4_t q2, svfloat32_t z3,
+	    float s4, double d5, float64x2_t q6, svfloat64_t z7,
+	    svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3)
+{
+  asm volatile ("");
+}