[pushed] aarch64: Fix streaming-compatible code with -mtrack-speculation [PR113805]

Message ID mptwmqzl6en.fsf@arm.com
State Committed
Commit 98702303e2b0e540b25c94805ca015688f78250e
Headers
Series [pushed] aarch64: Fix streaming-compatible code with -mtrack-speculation [PR113805] |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm warning Patch is already merged
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 warning Patch is already merged

Commit Message

Richard Sandiford Feb. 20, 2024, 11:30 a.m. UTC
  This patch makes -mtrack-speculation work on streaming-compatible
functions.  There were two related issues.  The first is that the
streaming-compatible code was using TB(N)Z unconditionally, whereas
those instructions are not allowed with speculation tracking.
That part can be fixed in a similar way to the recent eh_return
fix (PR112987).

The second issue was that the speculation-tracking pass runs
before some of the conditional branches are inserted.  It isn't
safe to insert the branches any earlier, so the patch instead adds
a second speculation-tracking pass that runs afterwards.  The new
pass is only used for streaming-compatible functions.

The testcase is adapted from call_sm_switch_1.c.

Tested on aarch64-linux-gnu & pushed.

Richard


gcc/
	PR target/113805
	* config/aarch64/aarch64-passes.def (pass_late_track_speculation):
	New pass.
	* config/aarch64/aarch64-protos.h (make_pass_late_track_speculation):
	Declare.
	* config/aarch64/aarch64.md (is_call): New attribute.
	(*and<mode>3nr_compare0): Rename to...
	(@aarch64_and<mode>3nr_compare0): ...this.
	* config/aarch64/aarch64-sme.md (aarch64_get_sme_state)
	(aarch64_tpidr2_save, aarch64_tpidr2_restore): Add is_call attributes.
	* config/aarch64/aarch64-speculation.cc: Update file comment to
	describe the new late pass.
	(aarch64_do_track_speculation): Handle is_call insns like other calls.
	(pass_track_speculation): Add an is_late member variable.
	(pass_track_speculation::gate): Run the late pass for streaming-
	compatible functions and the early pass for other functions.
	(make_pass_track_speculation): Update accordingly.
	(make_pass_late_track_speculation): New function.
	* config/aarch64/aarch64.cc (aarch64_gen_test_and_branch): New
	function.
	(aarch64_guard_switch_pstate_sm): Use it.

gcc/testsuite/
	PR target/113805
	* gcc.target/aarch64/sme/call_sm_switch_11.c: New test.
---
 gcc/config/aarch64/aarch64-passes.def         |   1 +
 gcc/config/aarch64/aarch64-protos.h           |   1 +
 gcc/config/aarch64/aarch64-sme.md             |   3 +
 gcc/config/aarch64/aarch64-speculation.cc     |  64 ++++--
 gcc/config/aarch64/aarch64.cc                 |  26 ++-
 gcc/config/aarch64/aarch64.md                 |   8 +-
 .../aarch64/sme/call_sm_switch_11.c           | 209 ++++++++++++++++++
 7 files changed, 291 insertions(+), 21 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_11.c
  

Patch

diff --git a/gcc/config/aarch64/aarch64-passes.def b/gcc/config/aarch64/aarch64-passes.def
index 769d48f4faa..e20e6f4395b 100644
--- a/gcc/config/aarch64/aarch64-passes.def
+++ b/gcc/config/aarch64/aarch64-passes.def
@@ -22,6 +22,7 @@  INSERT_PASS_BEFORE (pass_sched, 1, pass_aarch64_early_ra);
 INSERT_PASS_AFTER (pass_regrename, 1, pass_fma_steering);
 INSERT_PASS_BEFORE (pass_reorder_blocks, 1, pass_track_speculation);
 INSERT_PASS_BEFORE (pass_late_thread_prologue_and_epilogue, 1, pass_switch_pstate_sm);
+INSERT_PASS_BEFORE (pass_late_thread_prologue_and_epilogue, 1, pass_late_track_speculation);
 INSERT_PASS_AFTER (pass_machine_reorg, 1, pass_tag_collision_avoidance);
 INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_bti);
 INSERT_PASS_AFTER (pass_if_after_combine, 1, pass_cc_fusion);
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index a0b142e0b94..bd719b992a5 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1075,6 +1075,7 @@  std::string aarch64_get_extension_string_for_isa_flags (aarch64_feature_flags,
 rtl_opt_pass *make_pass_aarch64_early_ra (gcc::context *);
 rtl_opt_pass *make_pass_fma_steering (gcc::context *);
 rtl_opt_pass *make_pass_track_speculation (gcc::context *);
+rtl_opt_pass *make_pass_late_track_speculation (gcc::context *);
 rtl_opt_pass *make_pass_tag_collision_avoidance (gcc::context *);
 rtl_opt_pass *make_pass_insert_bti (gcc::context *ctxt);
 rtl_opt_pass *make_pass_cc_fusion (gcc::context *ctxt);
diff --git a/gcc/config/aarch64/aarch64-sme.md b/gcc/config/aarch64/aarch64-sme.md
index 6edcdd96679..81d941871ac 100644
--- a/gcc/config/aarch64/aarch64-sme.md
+++ b/gcc/config/aarch64/aarch64-sme.md
@@ -105,6 +105,7 @@  (define_insn "aarch64_get_sme_state"
    (clobber (reg:CC CC_REGNUM))]
   ""
   "bl\t__arm_sme_state"
+  [(set_attr "is_call" "yes")]
 )
 
 (define_insn "aarch64_read_svcr"
@@ -242,6 +243,7 @@  (define_insn "aarch64_tpidr2_save"
    (clobber (reg:CC CC_REGNUM))]
   ""
   "bl\t__arm_tpidr2_save"
+  [(set_attr "is_call" "yes")]
 )
 
 ;; Set PSTATE.ZA to 1.  If ZA was previously dormant or active,
@@ -358,6 +360,7 @@  (define_insn "aarch64_tpidr2_restore"
    (clobber (reg:CC CC_REGNUM))]
   ""
   "bl\t__arm_tpidr2_restore"
+  [(set_attr "is_call" "yes")]
 )
 
 ;; Check whether a lazy save set up by aarch64_save_za was committed
diff --git a/gcc/config/aarch64/aarch64-speculation.cc b/gcc/config/aarch64/aarch64-speculation.cc
index fdcc544b4d6..54f5ada7ed6 100644
--- a/gcc/config/aarch64/aarch64-speculation.cc
+++ b/gcc/config/aarch64/aarch64-speculation.cc
@@ -39,10 +39,10 @@ 
 #include "insn-config.h"
 #include "recog.h"
 
-/* This pass scans the RTL just before the final branch
-   re-organisation pass.  The aim is to identify all places where
-   there is conditional control flow and to insert code that tracks
-   any speculative execution of a conditional branch.
+/* This pass scans the RTL insns late in the RTL pipeline.  The aim is
+   to identify all places where there is conditional control flow and
+   to insert code that tracks any speculative execution of a conditional
+   branch.
 
    To do this we reserve a call-clobbered register (so that it can be
    initialized very early in the function prologue) that can then be
@@ -131,11 +131,22 @@ 
    carry the tracking state in SP for this period of time unless the
    tracker value is needed at that point in time.
 
-   We run the pass just before the final branch reorganization pass so
-   that we can handle most of the conditional branch cases using the
-   standard edge insertion code.  The reorg pass will hopefully clean
-   things up for afterwards so that the results aren't too
-   horrible.  */
+   We run the pass while the CFG is still present so that we can handle
+   most of the conditional branch cases using the standard edge insertion
+   code.  Where possible, we prefer to run the pass just before the final
+   branch reorganization pass.  That pass will then hopefully clean things
+   up afterwards so that the results aren't too horrible.
+
+   However, we must run the pass after all conditional branches have
+   been inserted.  switch_pstate_sm inserts conditional branches for
+   streaming-compatible code, and so for streaming-compatible functions,
+   this pass must run after that one.
+
+   We handle this by having two copies of the pass: the normal one that
+   runs before branch reorganization, and a "late" one that runs just
+   before late_thread_prologue_and_epilogue.  The two passes have
+   mutually exclusive gates, with the normal pass being chosen wherever
+   possible.  */
 
 /* Generate a code sequence to clobber SP if speculating incorreclty.  */
 static rtx_insn *
@@ -315,11 +326,15 @@  aarch64_do_track_speculation ()
 	      needs_tracking = true;
 	    }
 
-	  if (CALL_P (insn))
+	  if (CALL_P (insn)
+	      || (NONDEBUG_INSN_P (insn)
+		  && recog_memoized (insn) >= 0
+		  && get_attr_is_call (insn) == IS_CALL_YES))
 	    {
 	      bool tailcall
-		= (SIBLING_CALL_P (insn)
-		   || find_reg_note (insn, REG_NORETURN, NULL_RTX));
+		= (CALL_P (insn)
+		   && (SIBLING_CALL_P (insn)
+		       || find_reg_note (insn, REG_NORETURN, NULL_RTX)));
 
 	      /* Tailcalls are like returns, we can eliminate the
 		 transfer between the tracker register and SP if we
@@ -461,21 +476,28 @@  const pass_data pass_data_aarch64_track_speculation =
 
 class pass_track_speculation : public rtl_opt_pass
 {
- public:
-  pass_track_speculation(gcc::context *ctxt)
-    : rtl_opt_pass(pass_data_aarch64_track_speculation, ctxt)
-    {}
+public:
+  pass_track_speculation(gcc::context *ctxt, bool is_late)
+    : rtl_opt_pass(pass_data_aarch64_track_speculation, ctxt),
+      is_late (is_late)
+  {}
 
   /* opt_pass methods:  */
   virtual bool gate (function *)
     {
-      return aarch64_track_speculation;
+      return (aarch64_track_speculation
+	      && (is_late == bool (TARGET_STREAMING_COMPATIBLE)));
     }
 
   virtual unsigned int execute (function *)
     {
       return aarch64_do_track_speculation ();
     }
+
+private:
+  /* Whether this is the late pass that runs before late prologue/epilogue
+     insertion, or the normal pass that runs before branch reorganization.  */
+  bool is_late;
 }; // class pass_track_speculation.
 } // anon namespace.
 
@@ -483,5 +505,11 @@  class pass_track_speculation : public rtl_opt_pass
 rtl_opt_pass *
 make_pass_track_speculation (gcc::context *ctxt)
 {
-  return new pass_track_speculation (ctxt);
+  return new pass_track_speculation (ctxt, /*is_late=*/false);
+}
+
+rtl_opt_pass *
+make_pass_late_track_speculation (gcc::context *ctxt)
+{
+  return new pass_track_speculation (ctxt, /*is_late=*/true);
 }
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 32eae49d4e9..76f1230d81a 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -2659,6 +2659,27 @@  aarch64_gen_compare_zero_and_branch (rtx_code code, rtx x,
   return gen_rtx_SET (pc_rtx, x);
 }
 
+/* Return an rtx that branches to LABEL based on the value of bit BITNUM of X.
+   If CODE is NE, it branches to LABEL when the bit is set; if CODE is EQ,
+   it branches to LABEL when the bit is clear.  */
+
+static rtx
+aarch64_gen_test_and_branch (rtx_code code, rtx x, int bitnum,
+			     rtx_code_label *label)
+{
+  auto mode = GET_MODE (x);
+  if (aarch64_track_speculation)
+    {
+      auto mask = gen_int_mode (HOST_WIDE_INT_1U << bitnum, mode);
+      emit_insn (gen_aarch64_and3nr_compare0 (mode, x, mask));
+      rtx cc_reg = gen_rtx_REG (CC_NZVmode, CC_REGNUM);
+      rtx x = gen_rtx_fmt_ee (code, CC_NZVmode, cc_reg, const0_rtx);
+      return gen_condjump (x, cc_reg, label);
+    }
+  return gen_aarch64_tb (code, mode, mode,
+			 x, gen_int_mode (bitnum, mode), label);
+}
+
 /* Consider the operation:
 
      OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
@@ -4881,8 +4902,9 @@  aarch64_guard_switch_pstate_sm (rtx old_svcr, aarch64_feature_flags local_mode)
   gcc_assert (local_mode != 0);
   auto already_ok_cond = (local_mode & AARCH64_FL_SM_ON ? NE : EQ);
   auto *label = gen_label_rtx ();
-  auto *jump = emit_jump_insn (gen_aarch64_tb (already_ok_cond, DImode, DImode,
-					       old_svcr, const0_rtx, label));
+  auto branch = aarch64_gen_test_and_branch (already_ok_cond, old_svcr, 0,
+					     label);
+  auto *jump = emit_jump_insn (branch);
   JUMP_LABEL (jump) = label;
   return label;
 }
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 6d39d2109e1..33fbe1b2e8d 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -439,6 +439,12 @@  (define_enum "arches" [any rcpc8_4 fp fp_q base_simd nobase_simd
 
 (define_enum_attr "arch" "arches" (const_string "any"))
 
+;; Whether a normal INSN in fact contains a call.  Sometimes we represent
+;; calls to functions that use an ad-hoc ABI as normal insns, both for
+;; optimization reasons and to avoid the need to describe the ABI to
+;; target-independent code.
+(define_attr "is_call" "no,yes" (const_string "no"))
+
 ;; [For compatibility with Arm in pipeline models]
 ;; Attribute that specifies whether or not the instruction touches fp
 ;; registers.
@@ -5395,7 +5401,7 @@  (define_insn "*ands<GPI:mode>_compare0"
   [(set_attr "type" "alus_imm")]
 )
 
-(define_insn "*and<mode>3nr_compare0"
+(define_insn "@aarch64_and<mode>3nr_compare0"
   [(set (reg:CC_NZV CC_REGNUM)
 	(compare:CC_NZV
 	 (and:GPI (match_operand:GPI 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_11.c b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_11.c
new file mode 100644
index 00000000000..ee6f98737d9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_11.c
@@ -0,0 +1,209 @@ 
+// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls -funwind-tables -mtrack-speculation" }
+// { dg-final { check-function-bodies "**" "" } }
+
+void ns_callee ();
+ void s_callee () [[arm::streaming]];
+ void sc_callee () [[arm::streaming_compatible]];
+
+void ns_callee_stack (int, int, int, int, int, int, int, int, int);
+
+struct callbacks {
+  void (*ns_ptr) ();
+  void (*s_ptr) () [[arm::streaming]];
+  void (*sc_ptr) () [[arm::streaming_compatible]];
+};
+
+/*
+** sc_caller_sme:
+**	cmp	sp, #?0
+**	csetm	x15, ne
+**	stp	x29, x30, \[sp, #?-96\]!
+**	mov	x29, sp
+**	cntd	x16
+**	str	x16, \[sp, #?24\]
+**	stp	d8, d9, \[sp, #?32\]
+**	stp	d10, d11, \[sp, #?48\]
+**	stp	d12, d13, \[sp, #?64\]
+**	stp	d14, d15, \[sp, #?80\]
+**	mrs	x16, svcr
+**	str	x16, \[x29, #?16\]
+**	ldr	x16, \[x29, #?16\]
+**	tst	x16, #?1
+**	beq	[^\n]*
+**	csel	x15, x15, xzr, ne
+**	smstop	sm
+**	b	[^\n]*
+**	csel	x15, x15, xzr, eq
+**	mov	x14, sp
+**	and	x14, x14, x15
+**	mov	sp, x14
+**	bl	ns_callee
+**	cmp	sp, #?0
+**	csetm	x15, ne
+**	ldr	x16, \[x29, #?16\]
+**	tst	x16, #?1
+**	beq	[^\n]*
+**	csel	x15, x15, xzr, ne
+**	smstart	sm
+**	b	[^\n]*
+**	csel	x15, x15, xzr, eq
+**	ldr	x16, \[x29, #?16\]
+**	tst	x16, #?1
+**	bne	[^\n]*
+**	csel	x15, x15, xzr, eq
+**	smstart	sm
+**	b	[^\n]*
+**	csel	x15, x15, xzr, ne
+**	mov	x14, sp
+**	and	x14, x14, x15
+**	mov	sp, x14
+**	bl	s_callee
+**	cmp	sp, #?0
+**	csetm	x15, ne
+**	ldr	x16, \[x29, #?16\]
+**	tst	x16, #?1
+**	bne	[^\n]*
+**	csel	x15, x15, xzr, eq
+**	smstop	sm
+**	b	[^\n]*
+**	csel	x15, x15, xzr, ne
+**	mov	x14, sp
+**	and	x14, x14, x15
+**	mov	sp, x14
+**	bl	sc_callee
+**	cmp	sp, #?0
+**	csetm	x15, ne
+**	ldp	d8, d9, \[sp, #?32\]
+**	ldp	d10, d11, \[sp, #?48\]
+**	ldp	d12, d13, \[sp, #?64\]
+**	ldp	d14, d15, \[sp, #?80\]
+**	ldp	x29, x30, \[sp\], #?96
+**	mov	x14, sp
+**	and	x14, x14, x15
+**	mov	sp, x14
+**	ret
+*/
+void
+sc_caller_sme () [[arm::streaming_compatible]]
+{
+  ns_callee ();
+  s_callee ();
+  sc_callee ();
+}
+
+#pragma GCC target "+nosme"
+
+/*
+** sc_caller:
+**	cmp	sp, #?0
+**	csetm	x15, ne
+**	stp	x29, x30, \[sp, #?-96\]!
+**	mov	x29, sp
+**	cntd	x16
+**	str	x16, \[sp, #?24\]
+**	stp	d8, d9, \[sp, #?32\]
+**	stp	d10, d11, \[sp, #?48\]
+**	stp	d12, d13, \[sp, #?64\]
+**	stp	d14, d15, \[sp, #?80\]
+**	mov	x14, sp
+**	and	x14, x14, x15
+**	mov	sp, x14
+**	bl	__arm_sme_state
+**	cmp	sp, #?0
+**	csetm	x15, ne
+**	str	x0, \[x29, #?16\]
+**	...
+**	bl	sc_callee
+**	cmp	sp, #?0
+**	csetm	x15, ne
+**	ldp	d8, d9, \[sp, #?32\]
+**	ldp	d10, d11, \[sp, #?48\]
+**	ldp	d12, d13, \[sp, #?64\]
+**	ldp	d14, d15, \[sp, #?80\]
+**	ldp	x29, x30, \[sp\], #?96
+**	mov	x14, sp
+**	and	x14, x14, x15
+**	mov	sp, x14
+**	ret
+*/
+void
+sc_caller () [[arm::streaming_compatible]]
+{
+  ns_callee ();
+  sc_callee ();
+}
+
+/*
+** sc_caller_x0:
+**	...
+**	mov	x10, x0
+**	mov	x14, sp
+**	and	x14, x14, x15
+**	mov	sp, x14
+**	bl	__arm_sme_state
+**	...
+**	str	wzr, \[x10\]
+**	...
+*/
+void
+sc_caller_x0 (int *ptr) [[arm::streaming_compatible]]
+{
+  *ptr = 0;
+  ns_callee ();
+  sc_callee ();
+}
+
+/*
+** sc_caller_x1:
+**	...
+**	mov	x10, x0
+**	mov	x11, x1
+**	mov	x14, sp
+**	and	x14, x14, x15
+**	mov	sp, x14
+**	bl	__arm_sme_state
+**	...
+**	str	w11, \[x10\]
+**	...
+*/
+void
+sc_caller_x1 (int *ptr, int a) [[arm::streaming_compatible]]
+{
+  *ptr = a;
+  ns_callee ();
+  sc_callee ();
+}
+
+/*
+** sc_caller_stack:
+**	cmp	sp, #?0
+**	csetm	x15, ne
+**	sub	sp, sp, #112
+**	stp	x29, x30, \[sp, #?16\]
+**	add	x29, sp, #?16
+**	...
+**	stp	d8, d9, \[sp, #?48\]
+**	...
+**	bl	__arm_sme_state
+**	cmp	sp, #?0
+**	csetm	x15, ne
+**	str	x0, \[x29, #?16\]
+**	...
+**	bl	ns_callee_stack
+**	cmp	sp, #?0
+**	csetm	x15, ne
+**	ldr	x16, \[x29, #?16\]
+**	tst	x16, #?1
+**	beq	[^\n]*
+**	csel	x15, x15, xzr, ne
+**	smstart	sm
+**	...
+*/
+void
+sc_caller_stack () [[arm::streaming_compatible]]
+{
+  ns_callee_stack (0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+/* { dg-final { scan-assembler {sc_caller_sme:(?:(?!ret).)*\.cfi_offset 46, -72\n} } } */
+/* { dg-final { scan-assembler {sc_caller:(?:(?!ret).)*\.cfi_offset 46, -72\n} } } */