[18/21] aarch64: Add support for __arm_locally_streaming

Message ID: mpty1ewqod9.fsf@arm.com
State: Committed
Commit: 3f6e5991fab507aa79121dc44d1afcd622c78744
Series: aarch64: Add support for SME

Commit Message

Richard Sandiford Nov. 17, 2023, 5:30 p.m. UTC
  This patch adds support for the __arm_locally_streaming attribute,
which allows a function to use SME internally without changing
the function's ABI.  The attribute is valid but redundant for
__arm_streaming functions.
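
For illustration, a minimal sketch of the two accepted spellings
(function names here are hypothetical; the forms themselves are the
ones exercised by the new tests):

  /* Normal non-streaming ABI; the body is compiled in streaming mode,
     so the prologue enables PSTATE.SM and the epilogue disables it.  */
  __arm_locally_streaming void n_work (void) { /* SME code */ }

  /* Valid but redundant: the function is already streaming by ABI,
     so no mode switch is needed.  */
  [[arm::locally_streaming]] void s_work (void) [[arm::streaming]] { }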

gcc/
	* config/aarch64/aarch64.cc (aarch64_arm_attribute_table): Add
	arm::locally_streaming.
	(aarch64_fndecl_is_locally_streaming): New function.
	(aarch64_fndecl_sm_state): Handle locally-streaming functions.
	(aarch64_cfun_enables_pstate_sm): New function.
	(aarch64_add_offset): Add an argument that specifies whether
	the streaming vector length should be used instead of the
	prevailing one.
	(aarch64_split_add_offset, aarch64_add_sp, aarch64_sub_sp): Likewise.
	(aarch64_allocate_and_probe_stack_space): Likewise.
	(aarch64_expand_mov_immediate): Update calls accordingly.
	(aarch64_need_old_pstate_sm): Return true for locally-streaming
	streaming-compatible functions.
	(aarch64_layout_frame): Force all call-preserved Z and P registers
	to be saved and restored if the function switches PSTATE.SM in the
	prologue.
	(aarch64_get_separate_components): Disable shrink-wrapping of
	such Z and P saves and restores.
	(aarch64_use_late_prologue_epilogue): New function.
	(aarch64_expand_prologue): Measure SVE lengths in the streaming
	vector length for locally-streaming functions, then emit code
	to enable streaming mode.
	(aarch64_expand_epilogue): Likewise in reverse.
	(TARGET_USE_LATE_PROLOGUE_EPILOGUE): Define.
	* config/aarch64/aarch64-c.cc (aarch64_define_unconditional_macros):
	Define __arm_locally_streaming.

gcc/testsuite/
	* gcc.target/aarch64/sme/locally_streaming_1.c: New test.
	* gcc.target/aarch64/sme/locally_streaming_2.c: Likewise.
	* gcc.target/aarch64/sme/locally_streaming_3.c: Likewise.
	* gcc.target/aarch64/sme/locally_streaming_4.c: Likewise.
	* gcc.target/aarch64/sme/keyword_macros_1.c: Add
	__arm_locally_streaming.
	* g++.target/aarch64/sme/keyword_macros_1.C: Likewise.
---
 gcc/config/aarch64/aarch64-c.cc               |   1 +
 gcc/config/aarch64/aarch64.cc                 | 233 +++++++--
 .../g++.target/aarch64/sme/keyword_macros_1.C |   1 +
 .../gcc.target/aarch64/sme/keyword_macros_1.c |   1 +
 .../aarch64/sme/locally_streaming_1.c         | 466 ++++++++++++++++++
 .../aarch64/sme/locally_streaming_2.c         | 177 +++++++
 .../aarch64/sme/locally_streaming_3.c         | 273 ++++++++++
 .../aarch64/sme/locally_streaming_4.c         | 145 ++++++
 8 files changed, 1259 insertions(+), 38 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c

Patch

diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc
index f2fa5df1b82..2a8ca46987a 100644
--- a/gcc/config/aarch64/aarch64-c.cc
+++ b/gcc/config/aarch64/aarch64-c.cc
@@ -86,6 +86,7 @@  aarch64_define_unconditional_macros (cpp_reader *pfile)
 
   DEFINE_ARM_KEYWORD_MACRO ("streaming");
   DEFINE_ARM_KEYWORD_MACRO ("streaming_compatible");
+  DEFINE_ARM_KEYWORD_MACRO ("locally_streaming");
 
 #undef DEFINE_ARM_KEYWORD_MACRO
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 12753ac133e..6ad29a3a84f 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -3136,6 +3136,7 @@  static const attribute_spec aarch64_arm_attributes[] =
 			  NULL, attr_streaming_exclusions },
   { "streaming_compatible", 0, 0, false, true,  true,  true,
 			  NULL, attr_streaming_exclusions },
+  { "locally_streaming",  0, 0, true, false, false, false, NULL, NULL },
   { "new",		  1, -1, true, false, false, false,
 			  handle_arm_new, NULL },
   { "preserves",	  1, -1, false, true,  true,  true,
@@ -4445,6 +4446,16 @@  aarch64_fntype_isa_mode (const_tree fntype)
 	  | aarch64_fntype_pstate_za (fntype));
 }
 
+/* Return true if FNDECL uses streaming mode internally, as an
+   implementation choice.  */
+
+static bool
+aarch64_fndecl_is_locally_streaming (const_tree fndecl)
+{
+  return lookup_attribute ("arm", "locally_streaming",
+			   DECL_ATTRIBUTES (fndecl));
+}
+
 /* Return the state of PSTATE.SM when compiling the body of
    function FNDECL.  This might be different from the state of
    PSTATE.SM on entry.  */
@@ -4452,6 +4463,9 @@  aarch64_fntype_isa_mode (const_tree fntype)
 static aarch64_feature_flags
 aarch64_fndecl_pstate_sm (const_tree fndecl)
 {
+  if (aarch64_fndecl_is_locally_streaming (fndecl))
+    return AARCH64_FL_SM_ON;
+
   return aarch64_fntype_pstate_sm (TREE_TYPE (fndecl));
 }
 
@@ -4527,6 +4541,16 @@  aarch64_cfun_has_new_state (const char *state_name)
   return aarch64_fndecl_has_new_state (cfun->decl, state_name);
 }
 
+/* Return true if PSTATE.SM is 1 in the body of the current function,
+   but is not guaranteed to be 1 on entry.  */
+
+static bool
+aarch64_cfun_enables_pstate_sm ()
+{
+  return (aarch64_fndecl_is_locally_streaming (cfun->decl)
+	  && aarch64_cfun_incoming_pstate_sm () != AARCH64_FL_SM_ON);
+}
+
 /* Return true if the current function has state STATE_NAME, either by
    creating new state itself or by sharing state with callers.  */
 
@@ -6768,6 +6792,10 @@  aarch64_add_offset_temporaries (rtx x)
    TEMP2, if nonnull, is a second temporary register that doesn't
    overlap either DEST or REG.
 
+   FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of OFFSET
+   is measured relative to the SME vector length instead of the current
+   prevailing vector length.  It is 0 otherwise.
+
    Since this function may be used to adjust the stack pointer, we must
    ensure that it cannot cause transient stack deallocation (for example
    by first incrementing SP and then decrementing when adjusting by a
@@ -6776,6 +6804,7 @@  aarch64_add_offset_temporaries (rtx x)
 static void
 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
 		    poly_int64 offset, rtx temp1, rtx temp2,
+		    aarch64_feature_flags force_isa_mode,
 		    bool frame_related_p, bool emit_move_imm = true)
 {
   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
@@ -6788,9 +6817,18 @@  aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
   /* Try using ADDVL or ADDPL to add the whole value.  */
   if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
     {
-      rtx offset_rtx = gen_int_mode (offset, mode);
+      gcc_assert (offset.coeffs[0] == offset.coeffs[1]);
+      rtx offset_rtx;
+      if (force_isa_mode == 0)
+	offset_rtx = gen_int_mode (offset, mode);
+      else
+	offset_rtx = aarch64_sme_vq_immediate (mode, offset.coeffs[0], 0);
       rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
       RTX_FRAME_RELATED_P (insn) = frame_related_p;
+      if (frame_related_p && (force_isa_mode & AARCH64_FL_SM_ON))
+	add_reg_note (insn, REG_CFA_ADJUST_CFA,
+		      gen_rtx_SET (dest, plus_constant (Pmode, src,
+							offset)));
       return;
     }
 
@@ -6806,11 +6844,19 @@  aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
   if (src != const0_rtx
       && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
     {
-      rtx offset_rtx = gen_int_mode (poly_offset, mode);
+      rtx offset_rtx;
+      if (force_isa_mode == 0)
+	offset_rtx = gen_int_mode (poly_offset, mode);
+      else
+	offset_rtx = aarch64_sme_vq_immediate (mode, factor, 0);
       if (frame_related_p)
 	{
 	  rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
 	  RTX_FRAME_RELATED_P (insn) = true;
+	  if (force_isa_mode & AARCH64_FL_SM_ON)
+	    add_reg_note (insn, REG_CFA_ADJUST_CFA,
+			  gen_rtx_SET (dest, plus_constant (Pmode, src,
+							    poly_offset)));
 	  src = dest;
 	}
       else
@@ -6841,9 +6887,19 @@  aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
       rtx val;
       if (IN_RANGE (rel_factor, -32, 31))
 	{
+	  if (force_isa_mode & AARCH64_FL_SM_ON)
+	    {
+	      /* Try to use an unshifted RDSVL, otherwise fall back on
+		 a shifted RDSVL #1.  */
+	      if (aarch64_sve_rdvl_addvl_factor_p (factor))
+		shift = 0;
+	      else
+		factor = rel_factor * 16;
+	      val = aarch64_sme_vq_immediate (mode, factor, 0);
+	    }
 	  /* Try to use an unshifted CNT[BHWD] or RDVL.  */
-	  if (aarch64_sve_cnt_factor_p (factor)
-	      || aarch64_sve_rdvl_addvl_factor_p (factor))
+	  else if (aarch64_sve_cnt_factor_p (factor)
+		   || aarch64_sve_rdvl_addvl_factor_p (factor))
 	    {
 	      val = gen_int_mode (poly_int64 (factor, factor), mode);
 	      shift = 0;
@@ -6873,11 +6929,18 @@  aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
 	     a shift and add sequence for the multiplication.
 	     If CNTB << SHIFT is out of range, stick with the current
 	     shift factor.  */
-	  if (IN_RANGE (low_bit, 2, 16 * 16))
+	  if (force_isa_mode == 0
+	      && IN_RANGE (low_bit, 2, 16 * 16))
 	    {
 	      val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
 	      shift = 0;
 	    }
+	  else if ((force_isa_mode & AARCH64_FL_SM_ON)
+		   && aarch64_sve_rdvl_addvl_factor_p (low_bit))
+	    {
+	      val = aarch64_sme_vq_immediate (mode, low_bit, 0);
+	      shift = 0;
+	    }
 	  else
 	    val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
 
@@ -6965,30 +7028,34 @@  aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
 			  rtx offset_rtx, rtx temp1, rtx temp2)
 {
   aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
-		      temp1, temp2, false);
+		      temp1, temp2, 0, false);
 }
 
 /* Add DELTA to the stack pointer, marking the instructions frame-related.
-   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
-   if TEMP1 already contains abs (DELTA).  */
+   TEMP1 is available as a temporary if nonnull.  FORCE_ISA_MODE is as
+   for aarch64_add_offset.  EMIT_MOVE_IMM is false if TEMP1 already
+   contains abs (DELTA).  */
 
 static inline void
-aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
+aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta,
+		aarch64_feature_flags force_isa_mode, bool emit_move_imm)
 {
   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
-		      temp1, temp2, true, emit_move_imm);
+		      temp1, temp2, force_isa_mode, true, emit_move_imm);
 }
 
 /* Subtract DELTA from the stack pointer, marking the instructions
-   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
-   if nonnull.  */
+   frame-related if FRAME_RELATED_P.  FORCE_ISA_MODE is as for
+   aarch64_add_offset.  TEMP1 is available as a temporary if nonnull.  */
 
 static inline void
-aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
-		bool emit_move_imm = true)
+aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta,
+		aarch64_feature_flags force_isa_mode,
+		bool frame_related_p, bool emit_move_imm = true)
 {
   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
-		      temp1, temp2, frame_related_p, emit_move_imm);
+		      temp1, temp2, force_isa_mode, frame_related_p,
+		      emit_move_imm);
 }
 
 /* A streaming-compatible function needs to switch temporarily to the known
@@ -8014,11 +8081,11 @@  aarch64_expand_mov_immediate (rtx dest, rtx imm)
 		{
 		  base = aarch64_force_temporary (int_mode, dest, base);
 		  aarch64_add_offset (int_mode, dest, base, offset,
-				      NULL_RTX, NULL_RTX, false);
+				      NULL_RTX, NULL_RTX, 0, false);
 		}
 	      else
 		aarch64_add_offset (int_mode, dest, base, offset,
-				    dest, NULL_RTX, false);
+				    dest, NULL_RTX, 0, false);
 	    }
 	  return;
 	}
@@ -8045,7 +8112,7 @@  aarch64_expand_mov_immediate (rtx dest, rtx imm)
 	      gcc_assert (can_create_pseudo_p ());
 	      base = aarch64_force_temporary (int_mode, dest, base);
 	      aarch64_add_offset (int_mode, dest, base, const_offset,
-				  NULL_RTX, NULL_RTX, false);
+				  NULL_RTX, NULL_RTX, 0, false);
 	      return;
 	    }
 
@@ -8085,7 +8152,7 @@  aarch64_expand_mov_immediate (rtx dest, rtx imm)
 	      gcc_assert(can_create_pseudo_p ());
 	      base = aarch64_force_temporary (int_mode, dest, base);
 	      aarch64_add_offset (int_mode, dest, base, const_offset,
-				  NULL_RTX, NULL_RTX, false);
+				  NULL_RTX, NULL_RTX, 0, false);
 	      return;
 	    }
 	  /* FALLTHRU */
@@ -9728,6 +9795,9 @@  aarch64_need_old_pstate_sm ()
   if (aarch64_cfun_incoming_pstate_sm () != 0)
     return false;
 
+  if (aarch64_cfun_enables_pstate_sm ())
+    return true;
+
   if (cfun->machine->call_switches_pstate_sm)
     for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
       if (auto *call = dyn_cast<rtx_call_insn *> (insn))
@@ -9754,6 +9824,7 @@  aarch64_layout_frame (void)
   bool frame_related_fp_reg_p = false;
   aarch64_frame &frame = cfun->machine->frame;
   poly_int64 top_of_locals = -1;
+  bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
 
   vec_safe_truncate (frame.saved_gprs, 0);
   vec_safe_truncate (frame.saved_fprs, 0);
@@ -9791,7 +9862,7 @@  aarch64_layout_frame (void)
       frame.reg_offset[regno] = SLOT_REQUIRED;
 
   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
-    if (df_regs_ever_live_p (regno)
+    if ((enables_pstate_sm || df_regs_ever_live_p (regno))
 	&& !fixed_regs[regno]
 	&& !crtl->abi->clobbers_full_reg_p (regno))
       {
@@ -9820,7 +9891,7 @@  aarch64_layout_frame (void)
     }
 
   for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
-    if (df_regs_ever_live_p (regno)
+    if ((enables_pstate_sm || df_regs_ever_live_p (regno))
 	&& !fixed_regs[regno]
 	&& !crtl->abi->clobbers_full_reg_p (regno))
       frame.reg_offset[regno] = SLOT_REQUIRED;
@@ -9937,7 +10008,8 @@  aarch64_layout_frame (void)
   /* If the current function changes the SVE vector length, ensure that the
      old value of the DWARF VG register is saved and available in the CFI,
      so that outer frames with VL-sized offsets can be processed correctly.  */
-  if (cfun->machine->call_switches_pstate_sm)
+  if (cfun->machine->call_switches_pstate_sm
+      || aarch64_cfun_enables_pstate_sm ())
     {
       frame.reg_offset[VG_REGNUM] = offset;
       offset += UNITS_PER_WORD;
@@ -10776,9 +10848,16 @@  aarch64_get_separate_components (void)
   bitmap_clear (components);
 
   /* The registers we need saved to the frame.  */
+  bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
     if (aarch64_register_saved_on_entry (regno))
       {
+	/* Disallow shrink wrapping for registers that will be clobbered
+	   by an SMSTART SM in the prologue.  */
+	if (enables_pstate_sm
+	    && (FP_REGNUM_P (regno) || PR_REGNUM_P (regno)))
+	  continue;
+
 	/* Punt on saves and restores that use ST1D and LD1D.  We could
 	   try to be smarter, but it would involve making sure that the
 	   spare predicate register itself is safe to use at the save
@@ -11097,11 +11176,16 @@  aarch64_emit_stack_tie (rtx reg)
    events, e.g. if we were to allow the stack to be dropped by more than a page
    and then have multiple probes up and we take a signal somewhere in between
    then the signal handler doesn't know the state of the stack and can make no
-   assumptions about which pages have been probed.  */
+   assumptions about which pages have been probed.
+
+   FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of POLY_SIZE
+   is measured relative to the SME vector length instead of the current
+   prevailing vector length.  It is 0 otherwise.  */
 
 static void
 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 					poly_int64 poly_size,
+					aarch64_feature_flags force_isa_mode,
 					bool frame_related_p,
 					bool final_adjustment_p)
 {
@@ -11143,7 +11227,8 @@  aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
   if (known_lt (poly_size, min_probe_threshold)
       || !flag_stack_clash_protection)
     {
-      aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
+      aarch64_sub_sp (temp1, temp2, poly_size, force_isa_mode,
+		      frame_related_p);
       return;
     }
 
@@ -11160,7 +11245,8 @@  aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 
       /* First calculate the amount of bytes we're actually spilling.  */
       aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
-			  poly_size, temp1, temp2, false, true);
+			  poly_size, temp1, temp2, force_isa_mode,
+			  false, true);
 
       rtx_insn *insn = get_last_insn ();
 
@@ -11218,7 +11304,7 @@  aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
     {
       for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
 	{
-	  aarch64_sub_sp (NULL, temp2, guard_size, true);
+	  aarch64_sub_sp (NULL, temp2, guard_size, force_isa_mode, true);
 	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
 					   guard_used_by_caller));
 	  emit_insn (gen_blockage ());
@@ -11229,7 +11315,7 @@  aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
     {
       /* Compute the ending address.  */
       aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
-			  temp1, NULL, false, true);
+			  temp1, NULL, force_isa_mode, false, true);
       rtx_insn *insn = get_last_insn ();
 
       /* For the initial allocation, we don't have a frame pointer
@@ -11295,7 +11381,7 @@  aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
       if (final_adjustment_p && rounded_size != 0)
 	min_probe_threshold = 0;
 
-      aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
+      aarch64_sub_sp (temp1, temp2, residual, force_isa_mode, frame_related_p);
       if (residual >= min_probe_threshold)
 	{
 	  if (dump_file)
@@ -11360,6 +11446,14 @@  aarch64_epilogue_uses (int regno)
   return 0;
 }
 
+/* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE.  */
+
+static bool
+aarch64_use_late_prologue_epilogue ()
+{
+  return aarch64_cfun_enables_pstate_sm ();
+}
+
 /* The current function's frame has a save slot for the incoming state
    of SVCR.  Return a legitimate memory for the slot, based on the hard
    frame pointer.  */
@@ -11496,6 +11590,9 @@  aarch64_expand_prologue (void)
   unsigned reg2 = frame.wb_push_candidate2;
   bool emit_frame_chain = frame.emit_frame_chain;
   rtx_insn *insn;
+  aarch64_feature_flags force_isa_mode = 0;
+  if (aarch64_cfun_enables_pstate_sm ())
+    force_isa_mode = AARCH64_FL_SM_ON;
 
   if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
     {
@@ -11557,7 +11654,7 @@  aarch64_expand_prologue (void)
      less the amount of the guard reserved for use by the caller's
      outgoing args.  */
   aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
-					  true, false);
+					  force_isa_mode, true, false);
 
   if (callee_adjust != 0)
     aarch64_push_regs (reg1, reg2, callee_adjust);
@@ -11580,7 +11677,8 @@  aarch64_expand_prologue (void)
 	gcc_assert (known_eq (chain_offset, 0));
       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
 			  stack_pointer_rtx, chain_offset,
-			  tmp1_rtx, tmp0_rtx, frame_pointer_needed);
+			  tmp1_rtx, tmp0_rtx, force_isa_mode,
+			  frame_pointer_needed);
       if (frame_pointer_needed && !frame_size.is_constant ())
 	{
 	  /* Variable-sized frames need to describe the save slot
@@ -11627,6 +11725,7 @@  aarch64_expand_prologue (void)
 		  || known_eq (initial_adjust, 0));
       aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
 					      sve_callee_adjust,
+					      force_isa_mode,
 					      !frame_pointer_needed, false);
       bytes_below_sp -= sve_callee_adjust;
     }
@@ -11639,12 +11738,15 @@  aarch64_expand_prologue (void)
      that is assumed by the callee.  */
   gcc_assert (known_eq (bytes_below_sp, final_adjust));
   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
+					  force_isa_mode,
 					  !frame_pointer_needed, true);
   if (emit_frame_chain && maybe_ne (final_adjust, 0))
     aarch64_emit_stack_tie (hard_frame_pointer_rtx);
 
-  /* Save the incoming value of PSTATE.SM, if required.  */
-  if (known_ge (frame.old_svcr_offset, 0))
+  /* Save the incoming value of PSTATE.SM, if required.  Code further
+     down does this for locally-streaming functions.  */
+  if (known_ge (frame.old_svcr_offset, 0)
+      && !aarch64_cfun_enables_pstate_sm ())
     {
       rtx mem = aarch64_old_svcr_mem ();
       MEM_VOLATILE_P (mem) = 1;
@@ -11676,6 +11778,34 @@  aarch64_expand_prologue (void)
 	    emit_move_insn (gen_rtx_REG (DImode, R1_REGNUM), old_r1);
 	}
     }
+
+  /* Enable PSTATE.SM, if required.  */
+  if (aarch64_cfun_enables_pstate_sm ())
+    {
+      rtx_insn *guard_label = nullptr;
+      if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
+	{
+	  /* The current function is streaming-compatible.  Save the
+	     original state of PSTATE.SM.  */
+	  rtx svcr = gen_rtx_REG (DImode, IP0_REGNUM);
+	  emit_insn (gen_aarch64_read_svcr (svcr));
+	  emit_move_insn (aarch64_old_svcr_mem (), svcr);
+	  guard_label = aarch64_guard_switch_pstate_sm (svcr,
+							aarch64_isa_flags);
+	}
+      aarch64_sme_mode_switch_regs args_switch;
+      auto &args = crtl->args.info;
+      for (unsigned int i = 0; i < args.num_sme_mode_switch_args; ++i)
+	{
+	  rtx x = args.sme_mode_switch_args[i];
+	  args_switch.add_reg (GET_MODE (x), REGNO (x));
+	}
+      args_switch.emit_prologue ();
+      emit_insn (gen_aarch64_smstart_sm ());
+      args_switch.emit_epilogue ();
+      if (guard_label)
+	emit_label (guard_label);
+    }
 }
 
 /* Return TRUE if we can use a simple_return insn.
@@ -11722,6 +11852,9 @@  aarch64_expand_epilogue (rtx_call_insn *sibcall)
   HOST_WIDE_INT guard_size
     = 1 << param_stack_clash_protection_guard_size;
   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
+  aarch64_feature_flags force_isa_mode = 0;
+  if (aarch64_cfun_enables_pstate_sm ())
+    force_isa_mode = AARCH64_FL_SM_ON;
 
   /* We can re-use the registers when:
 
@@ -11746,6 +11879,24 @@  aarch64_expand_epilogue (rtx_call_insn *sibcall)
     = maybe_ne (get_frame_size ()
 		+ frame.saved_varargs_size, 0);
 
+  /* Reset PSTATE.SM, if required.  */
+  if (aarch64_cfun_enables_pstate_sm ())
+    {
+      rtx_insn *guard_label = nullptr;
+      if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
+	guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
+						      aarch64_isa_flags);
+      aarch64_sme_mode_switch_regs return_switch;
+      if (crtl->return_rtx && REG_P (crtl->return_rtx))
+	return_switch.add_reg (GET_MODE (crtl->return_rtx),
+			       REGNO (crtl->return_rtx));
+      return_switch.emit_prologue ();
+      emit_insn (gen_aarch64_smstop_sm ());
+      return_switch.emit_epilogue ();
+      if (guard_label)
+	emit_label (guard_label);
+    }
+
   /* Emit a barrier to prevent loads from a deallocated stack.  */
   if (maybe_gt (final_adjust, crtl->outgoing_args_size)
       || cfun->calls_alloca
@@ -11766,19 +11917,21 @@  aarch64_expand_epilogue (rtx_call_insn *sibcall)
     aarch64_add_offset (Pmode, stack_pointer_rtx,
 			hard_frame_pointer_rtx,
 			-bytes_below_hard_fp + final_adjust,
-			tmp1_rtx, tmp0_rtx, callee_adjust == 0);
+			tmp1_rtx, tmp0_rtx, force_isa_mode,
+			callee_adjust == 0);
   else
      /* The case where we need to re-use the register here is very rare, so
 	avoid the complicated condition and just always emit a move if the
 	immediate doesn't fit.  */
-     aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
+     aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, force_isa_mode, true);
 
   /* Restore the vector registers before the predicate registers,
      so that we can use P4 as a temporary for big-endian SVE frames.  */
   aarch64_restore_callee_saves (final_adjust, frame.saved_fprs, &cfi_ops);
   aarch64_restore_callee_saves (final_adjust, frame.saved_prs, &cfi_ops);
   if (maybe_ne (sve_callee_adjust, 0))
-    aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
+    aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust,
+		    force_isa_mode, true);
 
   /* When shadow call stack is enabled, the scs_pop in the epilogue will
      restore x30, we don't need to restore x30 again in the traditional
@@ -11808,7 +11961,7 @@  aarch64_expand_epilogue (rtx_call_insn *sibcall)
 
   /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
      add restriction on emit_move optimization to leaf functions.  */
-  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
+  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, force_isa_mode,
 		  (!can_inherit_p || !crtl->is_leaf
 		   || df_regs_ever_live_p (EP0_REGNUM)));
 
@@ -11941,7 +12094,8 @@  aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
   temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
 
   if (vcall_offset == 0)
-    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
+    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0,
+			0, false);
   else
     {
       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
@@ -11954,7 +12108,7 @@  aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
 				       plus_constant (Pmode, this_rtx, delta));
 	  else
 	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
-				temp1, temp0, false);
+				temp1, temp0, 0, false);
 	}
 
       if (Pmode == ptr_mode)
@@ -31355,6 +31509,9 @@  aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_EXTRA_LIVE_ON_ENTRY
 #define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_extra_live_on_entry
 
+#undef TARGET_USE_LATE_PROLOGUE_EPILOGUE
+#define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue
+
 #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
 #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue
 
diff --git a/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C b/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
index 8b0755014cc..dc5c097bd52 100644
--- a/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
+++ b/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
@@ -7,3 +7,4 @@  void f4 () __arm_out("za");
 void f5 () __arm_inout("za");
 void f6 () __arm_preserves("za");
 __arm_new("za") void f7 () {}
+__arm_locally_streaming void f8 () {}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c b/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
index fcabe3edc55..22f5facfdf9 100644
--- a/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
@@ -7,3 +7,4 @@  void f4 () __arm_out("za");
 void f5 () __arm_inout("za");
 void f6 () __arm_preserves("za");
 __arm_new("za") void f7 () {}
+__arm_locally_streaming void f8 () {}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
new file mode 100644
index 00000000000..20ff4b87d94
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
@@ -0,0 +1,466 @@ 
+// { dg-options "-O -fomit-frame-pointer" }
+// { dg-final { check-function-bodies "**" "" } }
+
+void consume_za () [[arm::streaming, arm::inout("za")]];
+
+/*
+** n_ls:
+**	sub	sp, sp, #?80
+**	cntd	x16
+**	str	x16, \[sp\]
+**	stp	d8, d9, \[sp, #?16\]
+**	stp	d10, d11, \[sp, #?32\]
+**	stp	d12, d13, \[sp, #?48\]
+**	stp	d14, d15, \[sp, #?64\]
+**	smstart	sm
+**	smstop	sm
+**	ldp	d8, d9, \[sp, #?16\]
+**	ldp	d10, d11, \[sp, #?32\]
+**	ldp	d12, d13, \[sp, #?48\]
+**	ldp	d14, d15, \[sp, #?64\]
+**	add	sp, sp, #?80
+**	ret
+*/
+[[arm::locally_streaming]] void
+n_ls ()
+{
+  asm ("");
+}
+
+/*
+** s_ls:
+**	ret
+*/
+[[arm::locally_streaming]] void
+s_ls () [[arm::streaming]]
+{
+  asm ("");
+}
+
+/*
+** sc_ls:
+**	stp	x29, x30, \[sp, #?-96\]!
+**	mov	x29, sp
+**	cntd	x16
+**	str	x16, \[sp, #?24\]
+**	stp	d8, d9, \[sp, #?32\]
+**	stp	d10, d11, \[sp, #?48\]
+**	stp	d12, d13, \[sp, #?64\]
+**	stp	d14, d15, \[sp, #?80\]
+**	mrs	x16, svcr
+**	str	x16, \[x29, #?16\]
+**	tbnz	x16, 0, [^\n]+
+**	smstart	sm
+**	ldr	x16, \[x29, #?16\]
+**	tbnz	x16, 0, [^\n]+
+**	smstop	sm
+**	ldp	d8, d9, \[sp, #?32\]
+**	ldp	d10, d11, \[sp, #?48\]
+**	ldp	d12, d13, \[sp, #?64\]
+**	ldp	d14, d15, \[sp, #?80\]
+**	ldp	x29, x30, \[sp\], #?96
+**	ret
+*/
+[[arm::locally_streaming]] void
+sc_ls () [[arm::streaming_compatible]]
+{
+  asm ("");
+}
+
+/*
+** n_ls_new_za:
+**	str	x30, \[sp, #?-80\]!
+**	cntd	x16
+**	str	x16, \[sp, #?8\]
+**	stp	d8, d9, \[sp, #?16\]
+**	stp	d10, d11, \[sp, #?32\]
+**	stp	d12, d13, \[sp, #?48\]
+**	stp	d14, d15, \[sp, #?64\]
+**	smstart	sm
+**	mrs	(x[0-9]+), tpidr2_el0
+**	cbz	\1, [^\n]+
+**	bl	__arm_tpidr2_save
+**	msr	tpidr2_el0, xzr
+**	zero	{ za }
+**	smstart	za
+**	bl	consume_za
+**	smstop	za
+**	smstop	sm
+**	ldp	d8, d9, \[sp, #?16\]
+**	ldp	d10, d11, \[sp, #?32\]
+**	ldp	d12, d13, \[sp, #?48\]
+**	ldp	d14, d15, \[sp, #?64\]
+**	ldr	x30, \[sp\], #?80
+**	ret
+*/
+[[arm::locally_streaming, arm::new("za")]] void
+n_ls_new_za ()
+{
+  consume_za ();
+  asm ("");
+}
+
+/*
+** s_ls_new_za:
+**	str	x30, \[sp, #?-16\]!
+**	mrs	(x[0-9]+), tpidr2_el0
+**	cbz	\1, [^\n]+
+**	bl	__arm_tpidr2_save
+**	msr	tpidr2_el0, xzr
+**	zero	{ za }
+**	smstart	za
+**	bl	consume_za
+**	smstop	za
+**	ldr	x30, \[sp\], #?16
+**	ret
+*/
+[[arm::locally_streaming, arm::new("za")]] void
+s_ls_new_za () [[arm::streaming]]
+{
+  consume_za ();
+  asm ("");
+}
+
+/*
+** sc_ls_new_za:
+**	stp	x29, x30, \[sp, #?-96\]!
+**	mov	x29, sp
+**	cntd	x16
+**	str	x16, \[sp, #?24\]
+**	stp	d8, d9, \[sp, #?32\]
+**	stp	d10, d11, \[sp, #?48\]
+**	stp	d12, d13, \[sp, #?64\]
+**	stp	d14, d15, \[sp, #?80\]
+**	mrs	x16, svcr
+**	str	x16, \[x29, #?16\]
+**	tbnz	x16, 0, [^\n]+
+**	smstart	sm
+**	mrs	(x[0-9]+), tpidr2_el0
+**	cbz	\1, [^\n]+
+**	bl	__arm_tpidr2_save
+**	msr	tpidr2_el0, xzr
+**	zero	{ za }
+**	smstart	za
+**	bl	consume_za
+**	smstop	za
+**	ldr	x16, \[x29, #?16\]
+**	tbnz	x16, 0, [^\n]+
+**	smstop	sm
+**	ldp	d8, d9, \[sp, #?32\]
+**	ldp	d10, d11, \[sp, #?48\]
+**	ldp	d12, d13, \[sp, #?64\]
+**	ldp	d14, d15, \[sp, #?80\]
+**	ldp	x29, x30, \[sp\], #?96
+**	ret
+*/
+[[arm::locally_streaming, arm::new("za")]] void
+sc_ls_new_za () [[arm::streaming_compatible]]
+{
+  consume_za ();
+  asm ("");
+}
+
+/*
+** n_ls_shared_za:
+**	str	x30, \[sp, #?-80\]!
+**	cntd	x16
+**	str	x16, \[sp, #?8\]
+**	stp	d8, d9, \[sp, #?16\]
+**	stp	d10, d11, \[sp, #?32\]
+**	stp	d12, d13, \[sp, #?48\]
+**	stp	d14, d15, \[sp, #?64\]
+**	smstart	sm
+**	bl	consume_za
+**	smstop	sm
+**	ldp	d8, d9, \[sp, #?16\]
+**	ldp	d10, d11, \[sp, #?32\]
+**	ldp	d12, d13, \[sp, #?48\]
+**	ldp	d14, d15, \[sp, #?64\]
+**	ldr	x30, \[sp\], #?80
+**	ret
+*/
+[[arm::locally_streaming]] void
+n_ls_shared_za () [[arm::inout("za")]]
+{
+  consume_za ();
+  asm ("");
+}
+
+/*
+** s_ls_shared_za:
+**	str	x30, \[sp, #?-16\]!
+**	bl	consume_za
+**	ldr	x30, \[sp\], #?16
+**	ret
+*/
+[[arm::locally_streaming]] void
+s_ls_shared_za () [[arm::streaming, arm::inout("za")]]
+{
+  consume_za ();
+  asm ("");
+}
+
+/*
+** sc_ls_shared_za:
+**	stp	x29, x30, \[sp, #?-96\]!
+**	mov	x29, sp
+**	cntd	x16
+**	str	x16, \[sp, #?24\]
+**	stp	d8, d9, \[sp, #?32\]
+**	stp	d10, d11, \[sp, #?48\]
+**	stp	d12, d13, \[sp, #?64\]
+**	stp	d14, d15, \[sp, #?80\]
+**	mrs	x16, svcr
+**	str	x16, \[x29, #?16\]
+**	tbnz	x16, 0, [^\n]+
+**	smstart	sm
+**	bl	consume_za
+**	ldr	x16, \[x29, #?16\]
+**	tbnz	x16, 0, [^\n]+
+**	smstop	sm
+**	ldp	d8, d9, \[sp, #?32\]
+**	ldp	d10, d11, \[sp, #?48\]
+**	ldp	d12, d13, \[sp, #?64\]
+**	ldp	d14, d15, \[sp, #?80\]
+**	ldp	x29, x30, \[sp\], #?96
+**	ret
+*/
+[[arm::locally_streaming]] void
+sc_ls_shared_za () [[arm::streaming_compatible, arm::inout("za")]]
+{
+  consume_za ();
+  asm ("");
+}
+
+/*
+** n_ls_vector_pcs:
+**	sub	sp, sp, #?272
+**	cntd	x16
+**	str	x16, \[sp\]
+**	stp	q8, q9, \[sp, #?16\]
+**	stp	q10, q11, \[sp, #?48\]
+**	stp	q12, q13, \[sp, #?80\]
+**	stp	q14, q15, \[sp, #?112\]
+**	stp	q16, q17, \[sp, #?144\]
+**	stp	q18, q19, \[sp, #?176\]
+**	stp	q20, q21, \[sp, #?208\]
+**	stp	q22, q23, \[sp, #?240\]
+**	smstart	sm
+**	smstop	sm
+**	ldp	q8, q9, \[sp, #?16\]
+**	ldp	q10, q11, \[sp, #?48\]
+**	ldp	q12, q13, \[sp, #?80\]
+**	ldp	q14, q15, \[sp, #?112\]
+**	ldp	q16, q17, \[sp, #?144\]
+**	ldp	q18, q19, \[sp, #?176\]
+**	ldp	q20, q21, \[sp, #?208\]
+**	ldp	q22, q23, \[sp, #?240\]
+**	add	sp, sp, #?272
+**	ret
+*/
+[[arm::locally_streaming]] void __attribute__((aarch64_vector_pcs))
+n_ls_vector_pcs ()
+{
+  asm ("");
+}
+
+/*
+** n_ls_sve_pcs:
+**	sub	sp, sp, #?16
+**	cntd	x16
+**	str	x16, \[sp\]
+**	addsvl	sp, sp, #-18
+**	str	p4, \[sp\]
+**	str	p5, \[sp, #1, mul vl\]
+**	str	p6, \[sp, #2, mul vl\]
+**	str	p7, \[sp, #3, mul vl\]
+**	str	p8, \[sp, #4, mul vl\]
+**	str	p9, \[sp, #5, mul vl\]
+**	str	p10, \[sp, #6, mul vl\]
+**	str	p11, \[sp, #7, mul vl\]
+**	str	p12, \[sp, #8, mul vl\]
+**	str	p13, \[sp, #9, mul vl\]
+**	str	p14, \[sp, #10, mul vl\]
+**	str	p15, \[sp, #11, mul vl\]
+**	str	z8, \[sp, #2, mul vl\]
+**	str	z9, \[sp, #3, mul vl\]
+**	str	z10, \[sp, #4, mul vl\]
+**	str	z11, \[sp, #5, mul vl\]
+**	str	z12, \[sp, #6, mul vl\]
+**	str	z13, \[sp, #7, mul vl\]
+**	str	z14, \[sp, #8, mul vl\]
+**	str	z15, \[sp, #9, mul vl\]
+**	str	z16, \[sp, #10, mul vl\]
+**	str	z17, \[sp, #11, mul vl\]
+**	str	z18, \[sp, #12, mul vl\]
+**	str	z19, \[sp, #13, mul vl\]
+**	str	z20, \[sp, #14, mul vl\]
+**	str	z21, \[sp, #15, mul vl\]
+**	str	z22, \[sp, #16, mul vl\]
+**	str	z23, \[sp, #17, mul vl\]
+**	addvl	sp, sp, #-1
+**	str	p0, \[sp\]
+**	smstart	sm
+**	ldr	p0, \[sp\]
+**	addvl	sp, sp, #1
+**	smstop	sm
+**	ldr	z8, \[sp, #2, mul vl\]
+**	ldr	z9, \[sp, #3, mul vl\]
+**	ldr	z10, \[sp, #4, mul vl\]
+**	ldr	z11, \[sp, #5, mul vl\]
+**	ldr	z12, \[sp, #6, mul vl\]
+**	ldr	z13, \[sp, #7, mul vl\]
+**	ldr	z14, \[sp, #8, mul vl\]
+**	ldr	z15, \[sp, #9, mul vl\]
+**	ldr	z16, \[sp, #10, mul vl\]
+**	ldr	z17, \[sp, #11, mul vl\]
+**	ldr	z18, \[sp, #12, mul vl\]
+**	ldr	z19, \[sp, #13, mul vl\]
+**	ldr	z20, \[sp, #14, mul vl\]
+**	ldr	z21, \[sp, #15, mul vl\]
+**	ldr	z22, \[sp, #16, mul vl\]
+**	ldr	z23, \[sp, #17, mul vl\]
+**	ldr	p4, \[sp\]
+**	ldr	p5, \[sp, #1, mul vl\]
+**	ldr	p6, \[sp, #2, mul vl\]
+**	ldr	p7, \[sp, #3, mul vl\]
+**	ldr	p8, \[sp, #4, mul vl\]
+**	ldr	p9, \[sp, #5, mul vl\]
+**	ldr	p10, \[sp, #6, mul vl\]
+**	ldr	p11, \[sp, #7, mul vl\]
+**	ldr	p12, \[sp, #8, mul vl\]
+**	ldr	p13, \[sp, #9, mul vl\]
+**	ldr	p14, \[sp, #10, mul vl\]
+**	ldr	p15, \[sp, #11, mul vl\]
+**	addsvl	sp, sp, #18
+**	add	sp, sp, #?16
+**	ret
+*/
+[[arm::locally_streaming]] void
+n_ls_sve_pcs (__SVBool_t x)
+{
+  asm ("");
+}
+
+/*
+** n_ls_v0:
+**	addsvl	sp, sp, #-1
+**	...
+**	smstart	sm
+**	add	x[0-9]+, [^\n]+
+**	smstop	sm
+**	...
+**	addsvl	sp, sp, #1
+**	...
+*/
+#define TEST(VN) __SVInt32_t VN; asm ("" :: "r" (&VN));
+[[arm::locally_streaming]] void
+n_ls_v0 ()
+{
+  TEST (v0);
+}
+
+/*
+** n_ls_v32:
+**	addsvl	sp, sp, #-32
+**	...
+**	smstart	sm
+**	...
+**	smstop	sm
+**	...
+**	rdsvl	(x[0-9]+), #1
+**	lsl	(x[0-9]+), \1, #?5
+**	add	sp, sp, \2
+**	...
+*/
+[[arm::locally_streaming]] void
+n_ls_v32 ()
+{
+  TEST (v0);
+  TEST (v1);
+  TEST (v2);
+  TEST (v3);
+  TEST (v4);
+  TEST (v5);
+  TEST (v6);
+  TEST (v7);
+  TEST (v8);
+  TEST (v9);
+  TEST (v10);
+  TEST (v11);
+  TEST (v12);
+  TEST (v13);
+  TEST (v14);
+  TEST (v15);
+  TEST (v16);
+  TEST (v17);
+  TEST (v18);
+  TEST (v19);
+  TEST (v20);
+  TEST (v21);
+  TEST (v22);
+  TEST (v23);
+  TEST (v24);
+  TEST (v25);
+  TEST (v26);
+  TEST (v27);
+  TEST (v28);
+  TEST (v29);
+  TEST (v30);
+  TEST (v31);
+}
+
+/*
+** n_ls_v33:
+**	rdsvl	(x[0-9]+), #1
+**	mov	(x[0-9]+), #?33
+**	mul	(x[0-9]+), (?:\1, \2|\2, \1)
+**	sub	sp, sp, \3
+**	...
+**	smstart	sm
+**	...
+**	smstop	sm
+**	...
+**	rdsvl	(x[0-9]+), #1
+**	mov	(x[0-9]+), #?33
+**	mul	(x[0-9]+), (?:\4, \5|\5, \4)
+**	add	sp, sp, \6
+**	...
+*/
+[[arm::locally_streaming]] void
+n_ls_v33 ()
+{
+  TEST (v0);
+  TEST (v1);
+  TEST (v2);
+  TEST (v3);
+  TEST (v4);
+  TEST (v5);
+  TEST (v6);
+  TEST (v7);
+  TEST (v8);
+  TEST (v9);
+  TEST (v10);
+  TEST (v11);
+  TEST (v12);
+  TEST (v13);
+  TEST (v14);
+  TEST (v15);
+  TEST (v16);
+  TEST (v17);
+  TEST (v18);
+  TEST (v19);
+  TEST (v20);
+  TEST (v21);
+  TEST (v22);
+  TEST (v23);
+  TEST (v24);
+  TEST (v25);
+  TEST (v26);
+  TEST (v27);
+  TEST (v28);
+  TEST (v29);
+  TEST (v30);
+  TEST (v31);
+  TEST (v32);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
new file mode 100644
index 00000000000..0eba993855f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
@@ -0,0 +1,177 @@ 
+// { dg-options "-O -fomit-frame-pointer" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+
+/*
+** test_d0:
+**	...
+**	smstart	sm
+**	...
+**	fmov	x10, d0
+**	smstop	sm
+**	fmov	d0, x10
+**	...
+*/
+[[arm::locally_streaming]] double
+test_d0 ()
+{
+  asm ("");
+  return 1.0f;
+}
+
+/*
+** test_d0_vec:
+**	...
+**	smstart	sm
+**	...
+** (
+**	fmov	x10, d0
+** |
+**	umov	x10, v0.d\[0\]
+** )
+**	smstop	sm
+**	fmov	d0, x10
+**	...
+*/
+[[arm::locally_streaming]] int8x8_t
+test_d0_vec ()
+{
+  asm ("");
+  return (int8x8_t) {};
+}
+
+/*
+** test_q0:
+**	...
+**	smstart	sm
+**	...
+**	str	q0, \[sp, #?-16\]!
+**	smstop	sm
+**	ldr	q0, \[sp\], #?16
+**	...
+*/
+[[arm::locally_streaming]] int8x16_t
+test_q0 ()
+{
+  asm ("");
+  return (int8x16_t) {};
+}
+
+/*
+** test_q1:
+**	...
+**	smstart	sm
+**	...
+**	stp	q0, q1, \[sp, #?-32\]!
+**	smstop	sm
+**	ldp	q0, q1, \[sp\], #?32
+**	...
+*/
+[[arm::locally_streaming]] int8x16x2_t
+test_q1 ()
+{
+  asm ("");
+  return (int8x16x2_t) {};
+}
+
+/*
+** test_q2:
+**	...
+**	smstart	sm
+**	...
+**	stp	q0, q1, \[sp, #?-48\]!
+**	str	q2, \[sp, #?32\]
+**	smstop	sm
+**	ldr	q2, \[sp, #?32\]
+**	ldp	q0, q1, \[sp\], #?48
+**	...
+*/
+[[arm::locally_streaming]] int8x16x3_t
+test_q2 ()
+{
+  asm ("");
+  return (int8x16x3_t) {};
+}
+
+/*
+** test_q3:
+**	...
+**	smstart	sm
+**	...
+**	stp	q0, q1, \[sp, #?-64\]!
+**	stp	q2, q3, \[sp, #?32\]
+**	smstop	sm
+**	ldp	q2, q3, \[sp, #?32\]
+**	ldp	q0, q1, \[sp\], #?64
+**	...
+*/
+[[arm::locally_streaming]] int8x16x4_t
+test_q3 ()
+{
+  asm ("");
+  return (int8x16x4_t) {};
+}
+
+/*
+** test_z0:
+**	...
+**	smstart	sm
+**	mov	z0\.b, #0
+**	addvl	sp, sp, #-1
+**	str	z0, \[sp\]
+**	smstop	sm
+**	ldr	z0, \[sp\]
+**	addvl	sp, sp, #1
+**	...
+*/
+[[arm::locally_streaming]] svint8_t
+test_z0 ()
+{
+  asm ("");
+  return (svint8_t) {};
+}
+
+/*
+** test_z3:
+**	...
+**	smstart	sm
+**	...
+**	addvl	sp, sp, #-4
+**	str	z0, \[sp\]
+**	str	z1, \[sp, #1, mul vl\]
+**	str	z2, \[sp, #2, mul vl\]
+**	str	z3, \[sp, #3, mul vl\]
+**	smstop	sm
+**	ldr	z0, \[sp\]
+**	ldr	z1, \[sp, #1, mul vl\]
+**	ldr	z2, \[sp, #2, mul vl\]
+**	ldr	z3, \[sp, #3, mul vl\]
+**	...
+*/
+[[arm::locally_streaming]] svint8x4_t
+test_z3 ()
+{
+  asm ("");
+  return (svint8x4_t) {};
+}
+
+/*
+** test_p0:
+**	...
+**	smstart	sm
+**	pfalse	p0\.b
+**	addvl	sp, sp, #-1
+**	str	p0, \[sp\]
+**	smstop	sm
+**	ldr	p0, \[sp\]
+**	addvl	sp, sp, #1
+**	...
+*/
+[[arm::locally_streaming]] svbool_t
+test_p0 ()
+{
+  asm ("");
+  return (svbool_t) {};
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
new file mode 100644
index 00000000000..2bdea6ac631
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
@@ -0,0 +1,273 @@ 
+// { dg-options "-O -fomit-frame-pointer" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+
+/*
+** test_d0:
+**	...
+**	fmov	x10, d0
+**	smstart	sm
+**	fmov	d0, x10
+**	smstop	sm
+**	...
+*/
+[[arm::locally_streaming]] void
+test_d0 (double d0)
+{
+  asm ("");
+}
+
+/*
+** test_d7:
+**	...
+**	fmov	x10, d0
+**	fmov	x11, d1
+**	fmov	x12, d2
+**	fmov	x13, d3
+**	fmov	x14, d4
+**	fmov	x15, d5
+**	fmov	x16, d6
+**	fmov	x17, d7
+**	smstart	sm
+**	fmov	d0, x10
+**	fmov	d1, x11
+**	fmov	d2, x12
+**	fmov	d3, x13
+**	fmov	d4, x14
+**	fmov	d5, x15
+**	fmov	d6, x16
+**	fmov	d7, x17
+**	smstop	sm
+**	...
+*/
+[[arm::locally_streaming]] void
+test_d7 (double d0, double d1, double d2, double d3,
+	 double d4, double d5, double d6, double d7)
+{
+  asm ("");
+}
+
+/*
+** test_d0_vec:
+**	...
+** (
+**	fmov	x10, d0
+** |
+**	umov	x10, v0.d\[0\]
+** )
+**	smstart	sm
+**	fmov	d0, x10
+**	smstop	sm
+**	...
+*/
+[[arm::locally_streaming]] void
+test_d0_vec (int8x8_t d0)
+{
+  asm ("");
+}
+
+/*
+** test_d7_vec:
+**	...
+** (
+**	fmov	x10, d0
+**	fmov	x11, d1
+**	fmov	x12, d2
+**	fmov	x13, d3
+**	fmov	x14, d4
+**	fmov	x15, d5
+**	fmov	x16, d6
+**	fmov	x17, d7
+** |
+**	umov	x10, v0.d\[0\]
+**	umov	x11, v1.d\[0\]
+**	umov	x12, v2.d\[0\]
+**	umov	x13, v3.d\[0\]
+**	umov	x14, v4.d\[0\]
+**	umov	x15, v5.d\[0\]
+**	umov	x16, v6.d\[0\]
+**	umov	x17, v7.d\[0\]
+** )
+**	smstart	sm
+**	fmov	d0, x10
+**	fmov	d1, x11
+**	fmov	d2, x12
+**	fmov	d3, x13
+**	fmov	d4, x14
+**	fmov	d5, x15
+**	fmov	d6, x16
+**	fmov	d7, x17
+**	smstop	sm
+**	...
+*/
+[[arm::locally_streaming]] void
+test_d7_vec (int8x8_t d0, int8x8_t d1, int8x8_t d2, int8x8_t d3,
+	     int8x8_t d4, int8x8_t d5, int8x8_t d6, int8x8_t d7)
+{
+  asm ("");
+}
+
+/*
+** test_q0:
+**	...
+**	str	q0, \[sp, #?-16\]!
+**	smstart	sm
+**	ldr	q0, \[sp\], #?16
+**	smstop	sm
+**	...
+*/
+[[arm::locally_streaming]] void
+test_q0 (int8x16_t q0)
+{
+  asm ("");
+}
+
+/*
+** test_q7:
+**	...
+**	stp	q0, q1, \[sp, #?-128\]!
+**	stp	q2, q3, \[sp, #?32\]
+**	stp	q4, q5, \[sp, #?64\]
+**	stp	q6, q7, \[sp, #?96\]
+**	smstart	sm
+**	ldp	q2, q3, \[sp, #?32\]
+**	ldp	q4, q5, \[sp, #?64\]
+**	ldp	q6, q7, \[sp, #?96\]
+**	ldp	q0, q1, \[sp\], #?128
+**	smstop	sm
+**	...
+*/
+[[arm::locally_streaming]] void
+test_q7 (int8x16x4_t q0, int8x16x4_t q4)
+{
+  asm ("");
+}
+
+/*
+** test_z0:
+**	...
+**	addvl	sp, sp, #-1
+**	str	z0, \[sp\]
+**	smstart	sm
+**	ldr	z0, \[sp\]
+**	addvl	sp, sp, #1
+**	smstop	sm
+**	...
+*/
+[[arm::locally_streaming]] void
+test_z0 (svint8_t z0)
+{
+  asm ("");
+}
+
+/*
+** test_z7:
+**	...
+**	addvl	sp, sp, #-8
+**	str	z0, \[sp\]
+**	str	z1, \[sp, #1, mul vl\]
+**	str	z2, \[sp, #2, mul vl\]
+**	str	z3, \[sp, #3, mul vl\]
+**	str	z4, \[sp, #4, mul vl\]
+**	str	z5, \[sp, #5, mul vl\]
+**	str	z6, \[sp, #6, mul vl\]
+**	str	z7, \[sp, #7, mul vl\]
+**	smstart	sm
+**	ldr	z0, \[sp\]
+**	ldr	z1, \[sp, #1, mul vl\]
+**	ldr	z2, \[sp, #2, mul vl\]
+**	ldr	z3, \[sp, #3, mul vl\]
+**	ldr	z4, \[sp, #4, mul vl\]
+**	ldr	z5, \[sp, #5, mul vl\]
+**	ldr	z6, \[sp, #6, mul vl\]
+**	ldr	z7, \[sp, #7, mul vl\]
+**	addvl	sp, sp, #8
+**	smstop	sm
+**	...
+*/
+[[arm::locally_streaming]] void
+test_z7 (svint8x4_t z0, svint8x4_t z4)
+{
+  asm ("");
+}
+
+/*
+** test_p0:
+**	...
+**	addvl	sp, sp, #-1
+**	str	p0, \[sp\]
+**	smstart	sm
+**	ldr	p0, \[sp\]
+**	addvl	sp, sp, #1
+**	smstop	sm
+**	...
+*/
+[[arm::locally_streaming]] void
+test_p0 (svbool_t p0)
+{
+  asm ("");
+}
+
+/*
+** test_p3:
+**	...
+**	addvl	sp, sp, #-1
+**	str	p0, \[sp\]
+**	str	p1, \[sp, #1, mul vl\]
+**	str	p2, \[sp, #2, mul vl\]
+**	str	p3, \[sp, #3, mul vl\]
+**	smstart	sm
+**	ldr	p0, \[sp\]
+**	ldr	p1, \[sp, #1, mul vl\]
+**	ldr	p2, \[sp, #2, mul vl\]
+**	ldr	p3, \[sp, #3, mul vl\]
+**	addvl	sp, sp, #1
+**	smstop	sm
+**	...
+*/
+[[arm::locally_streaming]] void
+test_p3 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3)
+{
+  asm ("");
+}
+
+/*
+** test_mixed:
+**	...
+**	addvl	sp, sp, #-3
+**	str	p0, \[sp\]
+**	str	p1, \[sp, #1, mul vl\]
+**	str	p2, \[sp, #2, mul vl\]
+**	str	p3, \[sp, #3, mul vl\]
+**	str	z3, \[sp, #1, mul vl\]
+**	str	z7, \[sp, #2, mul vl\]
+**	stp	q2, q6, \[sp, #?-32\]!
+**	fmov	w10, s0
+**	fmov	x11, d1
+**	fmov	w12, s4
+**	fmov	x13, d5
+**	smstart	sm
+**	fmov	s0, w10
+**	fmov	d1, x11
+**	fmov	s4, w12
+**	fmov	d5, x13
+**	ldp	q2, q6, \[sp\], #?32
+**	ldr	p0, \[sp\]
+**	ldr	p1, \[sp, #1, mul vl\]
+**	ldr	p2, \[sp, #2, mul vl\]
+**	ldr	p3, \[sp, #3, mul vl\]
+**	ldr	z3, \[sp, #1, mul vl\]
+**	ldr	z7, \[sp, #2, mul vl\]
+**	addvl	sp, sp, #3
+**	smstop	sm
+**	...
+*/
+[[arm::locally_streaming]] void
+test_mixed (float s0, double d1, float32x4_t q2, svfloat32_t z3,
+	    float s4, double d5, float64x2_t q6, svfloat64_t z7,
+	    svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3)
+{
+  asm ("");
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c
new file mode 100644
index 00000000000..42adeb152e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c
@@ -0,0 +1,145 @@ 
+// { dg-options "-O -fomit-frame-pointer" }
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+
+/*
+** test_d0:
+**	...
+**	smstart	sm
+**	...
+**	fmov	x10, d0
+**	smstop	sm
+**	fmov	d0, x10
+**	...
+**	smstart	sm
+**	...
+**	smstop	sm
+**	...
+*/
+void consume_d0 (double d0);
+
+__arm_locally_streaming void
+test_d0 ()
+{
+  asm ("");
+  consume_d0 (1.0);
+  asm ("");
+}
+
+/*
+** test_d7:
+**	...
+**	fmov	x10, d0
+**	fmov	x11, d1
+**	fmov	x12, d2
+**	fmov	x13, d3
+**	fmov	x14, d4
+**	fmov	x15, d5
+**	fmov	x16, d6
+**	fmov	x17, d7
+**	smstop	sm
+**	fmov	d0, x10
+**	fmov	d1, x11
+**	fmov	d2, x12
+**	fmov	d3, x13
+**	fmov	d4, x14
+**	fmov	d5, x15
+**	fmov	d6, x16
+**	fmov	d7, x17
+**	...
+*/
+void consume_d7 (double d0, double d1, double d2, double d3,
+		 double d4, double d5, double d6, double d7);
+__arm_locally_streaming void
+test_d7 ()
+{
+  asm ("");
+  consume_d7 (1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
+  asm ("");
+}
+
+/*
+** test_q7:
+**	...
+**	stp	q0, q1, \[sp, #?-128\]!
+**	stp	q2, q3, \[sp, #?32\]
+**	stp	q4, q5, \[sp, #?64\]
+**	stp	q6, q7, \[sp, #?96\]
+**	smstop	sm
+**	ldp	q2, q3, \[sp, #?32\]
+**	ldp	q4, q5, \[sp, #?64\]
+**	ldp	q6, q7, \[sp, #?96\]
+**	ldp	q0, q1, \[sp\], #?128
+**	...
+*/
+void consume_q7 (int8x16x4_t q0, int8x16x4_t q4);
+
+__arm_locally_streaming void
+test_q7 (int8x16x4_t *ptr)
+{
+  asm ("");
+  consume_q7 (ptr[0], ptr[1]);
+  asm ("");
+}
+
+/*
+** test_z7:
+**	...
+**	addvl	sp, sp, #-8
+**	str	z0, \[sp\]
+**	str	z1, \[sp, #1, mul vl\]
+**	str	z2, \[sp, #2, mul vl\]
+**	str	z3, \[sp, #3, mul vl\]
+**	str	z4, \[sp, #4, mul vl\]
+**	str	z5, \[sp, #5, mul vl\]
+**	str	z6, \[sp, #6, mul vl\]
+**	str	z7, \[sp, #7, mul vl\]
+**	smstop	sm
+**	ldr	z0, \[sp\]
+**	ldr	z1, \[sp, #1, mul vl\]
+**	ldr	z2, \[sp, #2, mul vl\]
+**	ldr	z3, \[sp, #3, mul vl\]
+**	ldr	z4, \[sp, #4, mul vl\]
+**	ldr	z5, \[sp, #5, mul vl\]
+**	ldr	z6, \[sp, #6, mul vl\]
+**	ldr	z7, \[sp, #7, mul vl\]
+**	addvl	sp, sp, #8
+**	...
+*/
+void consume_z7 (svint8x4_t z0, svint8x4_t z4);
+
+__arm_locally_streaming void
+test_z7 (svint8x4_t *ptr1, svint8x4_t *ptr2)
+{
+  asm ("");
+  consume_z7 (*ptr1, *ptr2);
+  asm ("");
+}
+
+/*
+** test_p3:
+**	...
+**	addvl	sp, sp, #-1
+**	str	p0, \[sp\]
+**	str	p1, \[sp, #1, mul vl\]
+**	str	p2, \[sp, #2, mul vl\]
+**	str	p3, \[sp, #3, mul vl\]
+**	smstop	sm
+**	ldr	p0, \[sp\]
+**	ldr	p1, \[sp, #1, mul vl\]
+**	ldr	p2, \[sp, #2, mul vl\]
+**	ldr	p3, \[sp, #3, mul vl\]
+**	addvl	sp, sp, #1
+**	...
+*/
+void consume_p3 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3);
+
+__arm_locally_streaming void
+test_p3 (svbool_t *ptr1, svbool_t *ptr2, svbool_t *ptr3, svbool_t *ptr4)
+{
+  asm ("");
+  consume_p3 (*ptr1, *ptr2, *ptr3, *ptr4);
+  asm ("");
+}