[v2] RISC-V: Add support to vector stack-clash protection

Message ID 20240801120126.4282-1-rzinsly@ventanamicro.com
State Committed
Commit 2862d99bfdae96a1d4b275fa3f3daad6206ff761
Delegated to: Jeff Law
Headers
Series [v2] RISC-V: Add support to vector stack-clash protection |

Checks

Context Check Description
rivoscibot/toolchain-ci-rivos-lint warning Lint failed
rivoscibot/toolchain-ci-rivos-apply-patch fail Patch failed to apply
linaro-tcwg-bot/tcwg_gcc_build--master-arm fail Patch failed to apply
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 fail Patch failed to apply

Commit Message

Raphael Moreira Zinsly Aug. 1, 2024, 12:01 p.m. UTC
  Changes since v1:
	- Changed RISCV_STACK_CLASH_VECTOR_CFA_REGNUM to t3.

-- >8 --

Adds basic support to vector stack-clash protection using a loop to do
the probing and stack adjustments.

gcc/ChangeLog:
	* config/riscv/riscv.cc
	(riscv_allocate_and_probe_stack_loop): New function.
	(riscv_v_adjust_scalable_frame): Add stack-clash protection
	support.
	(riscv_allocate_and_probe_stack_space): Move the probe loop
	implementation to riscv_allocate_and_probe_stack_loop.
	* config/riscv/riscv.h: Define RISCV_STACK_CLASH_VECTOR_CFA_REGNUM.

gcc/testsuite/ChangeLog:
	* gcc.target/riscv/stack-check-cfa-3.c: New test.
	* gcc.target/riscv/stack-check-prologue-16.c: New test.
	* gcc.target/riscv/struct_vect_24.c: New test.
---
 gcc/config/riscv/riscv.cc                     | 99 +++++++++++++++----
 gcc/config/riscv/riscv.h                      |  5 +
 .../gcc.target/riscv/stack-check-cfa-3.c      | 13 +++
 .../riscv/stack-check-prologue-16.c           | 30 ++++++
 .../gcc.target/riscv/struct_vect_24.c         | 47 +++++++++
 5 files changed, 173 insertions(+), 21 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/stack-check-cfa-3.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/stack-check-prologue-16.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/struct_vect_24.c
  

Comments

Jeff Law Aug. 1, 2024, 6:40 p.m. UTC | #1
On 8/1/24 6:01 AM, Raphael Moreira Zinsly wrote:

>   
> +/* Both prologue temp registers are used in the vector probe loop for when
> +   stack-clash protection is enabled, so we need to copy SP to a new register
> +   and set it as CFA during the loop, we are using T3 for that.  */
> +#define RISCV_STACK_CLASH_VECTOR_CFA_REGNUM (GP_TEMP_FIRST + 23)
"23" looks like a typo.  Shouldn't it be "3"?

Jeff
  
Raphael Moreira Zinsly Aug. 1, 2024, 8:16 p.m. UTC | #2
On Thu, Aug 1, 2024 at 3:40 PM Jeff Law <jeffreyalaw@gmail.com> wrote:
> On 8/1/24 6:01 AM, Raphael Moreira Zinsly wrote:
> > +/* Both prologue temp registers are used in the vector probe loop for when
> > +   stack-clash protection is enabled, so we need to copy SP to a new register
> > +   and set it as CFA during the loop, we are using T3 for that.  */
> > +#define RISCV_STACK_CLASH_VECTOR_CFA_REGNUM (GP_TEMP_FIRST + 23)
> "23" looks like a typo.  Shouldn't it be "3"?

GP_TEMP_FIRST + 3 = 8, which is s0/fp.
t3 is register 28.


--
Raphael Moreira Zinsly
  
Jeff Law Aug. 5, 2024, 2:19 p.m. UTC | #3
On 8/1/24 2:16 PM, Raphael Zinsly wrote:
> On Thu, Aug 1, 2024 at 3:40 PM Jeff Law <jeffreyalaw@gmail.com> wrote:
>> On 8/1/24 6:01 AM, Raphael Moreira Zinsly wrote:
>>> +/* Both prologue temp registers are used in the vector probe loop for when
>>> +   stack-clash protection is enabled, so we need to copy SP to a new register
>>> +   and set it as CFA during the loop, we are using T3 for that.  */
>>> +#define RISCV_STACK_CLASH_VECTOR_CFA_REGNUM (GP_TEMP_FIRST + 23)
>> "23" looks like a typo.  Shouldn't it be "3"?
> 
> GP_TEMP_FIRST + 3 = 8, which is s0/fp.
> t3 is register 28.
I'd forgotten the temps are a disjoint set, sorry about goofing that up.

The series is OK for the trunk.  IT's been a long road....


jeff
  

Patch

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index b3534ee0b92..ccbb7ea1324 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -7898,6 +7898,35 @@  static const code_for_push_pop_t code_for_push_pop[ZCMP_MAX_GRP_SLOTS][ZCMP_OP_N
       code_for_gpr_multi_popret_up_to_s11,
       code_for_gpr_multi_popretz_up_to_s11}};
 
+/*  Set a probe loop for stack clash protection.  */
+static void
+riscv_allocate_and_probe_stack_loop (rtx tmp, enum rtx_code code,
+				     rtx op0, rtx op1, bool vector,
+				     HOST_WIDE_INT offset)
+{
+  tmp = riscv_force_temporary (tmp, gen_int_mode (offset, Pmode));
+
+  /* Loop.  */
+  rtx label = gen_label_rtx ();
+  emit_label (label);
+
+  /* Allocate and probe stack.  */
+  emit_insn (gen_sub3_insn (stack_pointer_rtx, stack_pointer_rtx, tmp));
+  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+		    STACK_CLASH_CALLER_GUARD));
+  emit_insn (gen_blockage ());
+
+  /* Adjust the remaining vector length.  */
+  if (vector)
+    emit_insn (gen_sub3_insn (op0, op0, tmp));
+
+  /* Branch if there's still more bytes to probe.  */
+  riscv_expand_conditional_branch (label, code, op0, op1);
+  JUMP_LABEL (get_last_insn ()) = label;
+
+  emit_insn (gen_blockage ());
+}
+
 /* Adjust scalable frame of vector for prologue && epilogue. */
 
 static void
@@ -7910,6 +7939,49 @@  riscv_v_adjust_scalable_frame (rtx target, poly_int64 offset, bool epilogue)
   riscv_legitimize_poly_move (Pmode, adjust_size, tmp,
 			      gen_int_mode (offset, Pmode));
 
+  /* If doing stack clash protection then we use a loop to allocate and probe
+     the stack.  */
+  if (flag_stack_clash_protection && !epilogue)
+    {
+      HOST_WIDE_INT min_probe_threshold
+	= (1 << param_stack_clash_protection_guard_size) - STACK_CLASH_CALLER_GUARD;
+
+      if (!frame_pointer_needed)
+	{
+	  /* This is done to provide unwinding information for the stack
+	     adjustments we're about to do, however to prevent the optimizers
+	     from removing the T3 move and leaving the CFA note (which would be
+	     very wrong) we tie the old and new stack pointer together.
+	     The tie will expand to nothing but the optimizers will not touch
+	     the instruction.  */
+	  insn = get_last_insn ();
+	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, RISCV_STACK_CLASH_VECTOR_CFA_REGNUM);
+	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
+	  riscv_emit_stack_tie (stack_ptr_copy);
+
+	  /* We want the CFA independent of the stack pointer for the
+	     duration of the loop.  */
+	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	}
+
+      riscv_allocate_and_probe_stack_loop (tmp, GE, adjust_size, tmp, true,
+					   min_probe_threshold);
+
+      /* Allocate the residual.  */
+      insn = emit_insn (gen_sub3_insn (target, target, adjust_size));
+
+      /* Now reset the CFA register if needed.  */
+      if (!frame_pointer_needed)
+	{
+	  add_reg_note (insn, REG_CFA_DEF_CFA,
+			plus_constant (Pmode, stack_pointer_rtx, -offset));
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	}
+
+      return;
+    }
+
   if (epilogue)
     insn = gen_add3_insn (target, target, adjust_size);
   else
@@ -8057,8 +8129,9 @@  riscv_allocate_and_probe_stack_space (rtx temp1, HOST_WIDE_INT size)
   else
     {
       /* Compute the ending address.  */
-      temp1 = riscv_force_temporary (temp1, gen_int_mode (rounded_size, Pmode));
-      insn = emit_insn (gen_sub3_insn (temp1, stack_pointer_rtx, temp1));
+      rtx temp2 = gen_rtx_REG (Pmode, RISCV_PROLOGUE_TEMP2_REGNUM);
+      temp2 = riscv_force_temporary (temp2, gen_int_mode (rounded_size, Pmode));
+      insn = emit_insn (gen_sub3_insn (temp2, stack_pointer_rtx, temp2));
 
       if (!frame_pointer_needed)
 	{
@@ -8069,25 +8142,9 @@  riscv_allocate_and_probe_stack_space (rtx temp1, HOST_WIDE_INT size)
 	  RTX_FRAME_RELATED_P (insn) = 1;
 	}
 
-      /* Allocate and probe the stack.  */
-
-      rtx temp2 = gen_rtx_REG (Pmode, RISCV_PROLOGUE_TEMP2_REGNUM);
-      temp2 = riscv_force_temporary (temp2, gen_int_mode (guard_size, Pmode));
-
-      /* Loop.  */
-      rtx label = gen_label_rtx ();
-      emit_label (label);
-
-      emit_insn (gen_sub3_insn (stack_pointer_rtx, stack_pointer_rtx, temp2));
-      emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
-			   guard_used_by_caller));
-      emit_insn (gen_blockage ());
-
-      /* Check if the stack pointer is at the ending address.  */
-      riscv_expand_conditional_branch (label, NE, stack_pointer_rtx, temp1);
-      JUMP_LABEL (get_last_insn ()) = label;
-
-      emit_insn (gen_blockage ());
+      /* This allocates and probes the stack.  */
+      riscv_allocate_and_probe_stack_loop (temp1, NE, stack_pointer_rtx, temp2,
+					   false, guard_size);
 
       /* Now reset the CFA register if needed.  */
       if (!frame_pointer_needed)
diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h
index 9670c7df8f7..35a53982ae9 100644
--- a/gcc/config/riscv/riscv.h
+++ b/gcc/config/riscv/riscv.h
@@ -429,6 +429,11 @@  ASM_MISA_SPEC
 #define RISCV_PROLOGUE_TEMP2_REGNUM (GP_TEMP_FIRST + 1)
 #define RISCV_PROLOGUE_TEMP2(MODE) gen_rtx_REG (MODE, RISCV_PROLOGUE_TEMP2_REGNUM)
 
+/* Both prologue temp registers are used in the vector probe loop for when
+   stack-clash protection is enabled, so we need to copy SP to a new register
+   and set it as CFA during the loop, we are using T3 for that.  */
+#define RISCV_STACK_CLASH_VECTOR_CFA_REGNUM (GP_TEMP_FIRST + 23)
+
 #define RISCV_CALL_ADDRESS_TEMP_REGNUM (GP_TEMP_FIRST + 1)
 #define RISCV_CALL_ADDRESS_TEMP(MODE) \
   gen_rtx_REG (MODE, RISCV_CALL_ADDRESS_TEMP_REGNUM)
diff --git a/gcc/testsuite/gcc.target/riscv/stack-check-cfa-3.c b/gcc/testsuite/gcc.target/riscv/stack-check-cfa-3.c
new file mode 100644
index 00000000000..e45f7bb7df5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/stack-check-cfa-3.c
@@ -0,0 +1,13 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -fstack-clash-protection -funwind-tables -fno-stack-protector" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+#include "stack-check-prologue-16.c"
+
+/* Checks that the CFA notes are correct for every sp adjustment, but we also
+   need to make sure we can unwind correctly before the frame is set up.  So
+   check that we're emitting t3 with a copy of sp an setting the CFA there.  */
+
+/* { dg-final { scan-assembler-times {mv\tt3,sp} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_def_cfa [0-9]+, 0} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_escape 0xf,0xa,0x72,0,0x92,0xa2,0x38,0,0x9,0xec,0x1e,0x22} 1 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/stack-check-prologue-16.c b/gcc/testsuite/gcc.target/riscv/stack-check-prologue-16.c
new file mode 100644
index 00000000000..c74dce04b23
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/stack-check-prologue-16.c
@@ -0,0 +1,30 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -fstack-clash-protection" } */
+
+/* Invoke X (P##n) for n in [0, 7].  */
+#define REPEAT8(X, P) \
+  X (P##0) X (P##1) X (P##2) X (P##3) X (P##4) X (P##5) X (P##6) X (P##7)
+
+/* Invoke X (n) for all octal n in [0, 39].  */
+#define REPEAT40(X) \
+  REPEAT8 (X, 0) REPEAT8 (X, 1)  REPEAT8 (X, 2) REPEAT8 (X, 3) REPEAT8 (X, 4)
+
+/* Expect vector work to be done, with spilling of vector registers.  */
+void
+f2 (int x[40][100], int *y)
+{
+  /* Try to force some spilling.  */
+#define DECLARE(N) int y##N = y[N];
+  REPEAT40 (DECLARE);
+#pragma omp simd
+  for (int i = 0; i < 100; ++i)
+    {
+#define INC(N) x[N][i] += y##N;
+      REPEAT40 (INC);
+    }
+}
+
+/* Vector spill, requires probing as vector size is unknown at compile time.  */
+
+/* { dg-final { scan-assembler-times {sd\tzero,1024\(sp\)} 1 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/struct_vect_24.c b/gcc/testsuite/gcc.target/riscv/struct_vect_24.c
new file mode 100644
index 00000000000..7c0852f1a55
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/struct_vect_24.c
@@ -0,0 +1,47 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -fstack-clash-protection -fno-stack-protector" } */
+/* { dg-skip-if "" { *-*-* } {"-O0" "-O1" "-O2" "-Og" "-Os" "-Oz" "-funroll-loops"} } */
+
+#include <stdint.h>
+
+#define N 50
+#define S 2 * 4 * 1024
+
+/* Invoke X (P##n) for n in [0, 9].  */
+#define REPEAT8(X, P) \
+  X (P##0) X (P##1) X (P##2) X (P##3) X (P##4) X (P##5) X (P##6) X (P##7) \
+  X (P##8)  X (P##9)
+
+/* Invoke X (n) for all n in [0, 49].  */
+#define REPEAT50(X) \
+  REPEAT8 (X, ) REPEAT8 (X, 1)  REPEAT8 (X, 2) REPEAT8 (X, 3) REPEAT8 (X, 4)
+
+  /* Try to force some spilling.  */
+#define DECLARE(N) int src##N = src[N * 4];
+#define INC(N) dest[i] += src##N;
+
+#define TEST_LOOP(NAME, TYPE)				\
+  void __attribute__ ((noinline))	\
+  NAME (TYPE *restrict dest, TYPE *restrict src)	\
+  {							\
+    REPEAT50 (DECLARE);					\
+    volatile char foo[S];				\
+    foo[S-1]=1;						\
+    for (int i = 0; i < N; i++)				\
+      {							\
+	REPEAT50 (INC);					\
+      }							\
+  }
+
+#define TEST(NAME) \
+  TEST_LOOP (NAME##_i32, int32_t) \
+  TEST_LOOP (NAME##_i64, int64_t)
+
+TEST (test)
+
+/* Check the vectorized loop for stack clash probing.  */
+
+/* { dg-final { scan-assembler-times {sd\tzero,1024\(sp\)} 6 } } */
+/* { dg-final { scan-assembler-times {bge\tt1,t0,.[^\\r\\n]*} 2 } } */
+/* { dg-final { scan-assembler-times {sub\s+t1,t1,t0} 2 } } */