@@ -7898,6 +7898,35 @@ static const code_for_push_pop_t code_for_push_pop[ZCMP_MAX_GRP_SLOTS][ZCMP_OP_N
code_for_gpr_multi_popret_up_to_s11,
code_for_gpr_multi_popretz_up_to_s11}};
+/* Set a probe loop for stack clash protection. */
+static void
+riscv_allocate_and_probe_stack_loop (rtx tmp, enum rtx_code code,
+ rtx op0, rtx op1, bool vector,
+ HOST_WIDE_INT offset)
+{
+ tmp = riscv_force_temporary (tmp, gen_int_mode (offset, Pmode));
+
+ /* Loop. */
+ rtx label = gen_label_rtx ();
+ emit_label (label);
+
+ /* Allocate and probe stack. */
+ emit_insn (gen_sub3_insn (stack_pointer_rtx, stack_pointer_rtx, tmp));
+ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+ STACK_CLASH_CALLER_GUARD));
+ emit_insn (gen_blockage ());
+
+ /* Adjust the remaining vector length. */
+ if (vector)
+ emit_insn (gen_sub3_insn (op0, op0, tmp));
+
+ /* Branch if there's still more bytes to probe. */
+ riscv_expand_conditional_branch (label, code, op0, op1);
+ JUMP_LABEL (get_last_insn ()) = label;
+
+ emit_insn (gen_blockage ());
+}
+
/* Adjust scalable frame of vector for prologue && epilogue. */
static void
@@ -7910,6 +7939,49 @@ riscv_v_adjust_scalable_frame (rtx target, poly_int64 offset, bool epilogue)
riscv_legitimize_poly_move (Pmode, adjust_size, tmp,
gen_int_mode (offset, Pmode));
+ /* If doing stack clash protection then we use a loop to allocate and probe
+ the stack. */
+ if (flag_stack_clash_protection && !epilogue)
+ {
+ HOST_WIDE_INT min_probe_threshold
+ = (1 << param_stack_clash_protection_guard_size) - STACK_CLASH_CALLER_GUARD;
+
+ if (!frame_pointer_needed)
+ {
+ /* This is done to provide unwinding information for the stack
+ adjustments we're about to do, however to prevent the optimizers
+ from removing the T3 move and leaving the CFA note (which would be
+ very wrong) we tie the old and new stack pointer together.
+ The tie will expand to nothing but the optimizers will not touch
+ the instruction. */
+ insn = get_last_insn ();
+ rtx stack_ptr_copy = gen_rtx_REG (Pmode, RISCV_STACK_CLASH_VECTOR_CFA_REGNUM);
+ emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
+ riscv_emit_stack_tie (stack_ptr_copy);
+
+ /* We want the CFA independent of the stack pointer for the
+ duration of the loop. */
+ add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
+
+ riscv_allocate_and_probe_stack_loop (tmp, GE, adjust_size, tmp, true,
+ min_probe_threshold);
+
+ /* Allocate the residual. */
+ insn = emit_insn (gen_sub3_insn (target, target, adjust_size));
+
+ /* Now reset the CFA register if needed. */
+ if (!frame_pointer_needed)
+ {
+ add_reg_note (insn, REG_CFA_DEF_CFA,
+ plus_constant (Pmode, stack_pointer_rtx, -offset));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
+
+ return;
+ }
+
if (epilogue)
insn = gen_add3_insn (target, target, adjust_size);
else
@@ -8057,8 +8129,9 @@ riscv_allocate_and_probe_stack_space (rtx temp1, HOST_WIDE_INT size)
else
{
/* Compute the ending address. */
- temp1 = riscv_force_temporary (temp1, gen_int_mode (rounded_size, Pmode));
- insn = emit_insn (gen_sub3_insn (temp1, stack_pointer_rtx, temp1));
+ rtx temp2 = gen_rtx_REG (Pmode, RISCV_PROLOGUE_TEMP2_REGNUM);
+ temp2 = riscv_force_temporary (temp2, gen_int_mode (rounded_size, Pmode));
+ insn = emit_insn (gen_sub3_insn (temp2, stack_pointer_rtx, temp2));
if (!frame_pointer_needed)
{
@@ -8069,25 +8142,9 @@ riscv_allocate_and_probe_stack_space (rtx temp1, HOST_WIDE_INT size)
RTX_FRAME_RELATED_P (insn) = 1;
}
- /* Allocate and probe the stack. */
-
- rtx temp2 = gen_rtx_REG (Pmode, RISCV_PROLOGUE_TEMP2_REGNUM);
- temp2 = riscv_force_temporary (temp2, gen_int_mode (guard_size, Pmode));
-
- /* Loop. */
- rtx label = gen_label_rtx ();
- emit_label (label);
-
- emit_insn (gen_sub3_insn (stack_pointer_rtx, stack_pointer_rtx, temp2));
- emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
- guard_used_by_caller));
- emit_insn (gen_blockage ());
-
- /* Check if the stack pointer is at the ending address. */
- riscv_expand_conditional_branch (label, NE, stack_pointer_rtx, temp1);
- JUMP_LABEL (get_last_insn ()) = label;
-
- emit_insn (gen_blockage ());
+ /* This allocates and probes the stack. */
+ riscv_allocate_and_probe_stack_loop (temp1, NE, stack_pointer_rtx, temp2,
+ false, guard_size);
/* Now reset the CFA register if needed. */
if (!frame_pointer_needed)
@@ -429,6 +429,11 @@ ASM_MISA_SPEC
#define RISCV_PROLOGUE_TEMP2_REGNUM (GP_TEMP_FIRST + 1)
#define RISCV_PROLOGUE_TEMP2(MODE) gen_rtx_REG (MODE, RISCV_PROLOGUE_TEMP2_REGNUM)
+/* Both prologue temp registers are used in the vector probe loop for when
+ stack-clash protection is enabled, so we need to copy SP to a new register
+ and set it as CFA during the loop, we are using T3 for that. */
+#define RISCV_STACK_CLASH_VECTOR_CFA_REGNUM (GP_TEMP_FIRST + 23)
+
#define RISCV_CALL_ADDRESS_TEMP_REGNUM (GP_TEMP_FIRST + 1)
#define RISCV_CALL_ADDRESS_TEMP(MODE) \
gen_rtx_REG (MODE, RISCV_CALL_ADDRESS_TEMP_REGNUM)
new file mode 100644
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -fstack-clash-protection -funwind-tables -fno-stack-protector" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+#include "stack-check-prologue-16.c"
+
+/* Checks that the CFA notes are correct for every sp adjustment, but we also
+ need to make sure we can unwind correctly before the frame is set up. So
+ check that we're emitting t3 with a copy of sp an setting the CFA there. */
+
+/* { dg-final { scan-assembler-times {mv\tt3,sp} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_def_cfa [0-9]+, 0} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_escape 0xf,0xa,0x72,0,0x92,0xa2,0x38,0,0x9,0xec,0x1e,0x22} 1 } } */
new file mode 100644
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -fstack-clash-protection" } */
+
+/* Invoke X (P##n) for n in [0, 7]. */
+#define REPEAT8(X, P) \
+ X (P##0) X (P##1) X (P##2) X (P##3) X (P##4) X (P##5) X (P##6) X (P##7)
+
+/* Invoke X (n) for all octal n in [0, 39]. */
+#define REPEAT40(X) \
+ REPEAT8 (X, 0) REPEAT8 (X, 1) REPEAT8 (X, 2) REPEAT8 (X, 3) REPEAT8 (X, 4)
+
+/* Expect vector work to be done, with spilling of vector registers. */
+void
+f2 (int x[40][100], int *y)
+{
+ /* Try to force some spilling. */
+#define DECLARE(N) int y##N = y[N];
+ REPEAT40 (DECLARE);
+#pragma omp simd
+ for (int i = 0; i < 100; ++i)
+ {
+#define INC(N) x[N][i] += y##N;
+ REPEAT40 (INC);
+ }
+}
+
+/* Vector spill, requires probing as vector size is unknown at compile time. */
+
+/* { dg-final { scan-assembler-times {sd\tzero,1024\(sp\)} 1 } } */
new file mode 100644
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -fstack-clash-protection -fno-stack-protector" } */
+/* { dg-skip-if "" { *-*-* } {"-O0" "-O1" "-O2" "-Og" "-Os" "-Oz" "-funroll-loops"} } */
+
+#include <stdint.h>
+
+#define N 50
+#define S 2 * 4 * 1024
+
+/* Invoke X (P##n) for n in [0, 9]. */
+#define REPEAT8(X, P) \
+ X (P##0) X (P##1) X (P##2) X (P##3) X (P##4) X (P##5) X (P##6) X (P##7) \
+ X (P##8) X (P##9)
+
+/* Invoke X (n) for all n in [0, 49]. */
+#define REPEAT50(X) \
+ REPEAT8 (X, ) REPEAT8 (X, 1) REPEAT8 (X, 2) REPEAT8 (X, 3) REPEAT8 (X, 4)
+
+ /* Try to force some spilling. */
+#define DECLARE(N) int src##N = src[N * 4];
+#define INC(N) dest[i] += src##N;
+
+#define TEST_LOOP(NAME, TYPE) \
+ void __attribute__ ((noinline)) \
+ NAME (TYPE *restrict dest, TYPE *restrict src) \
+ { \
+ REPEAT50 (DECLARE); \
+ volatile char foo[S]; \
+ foo[S-1]=1; \
+ for (int i = 0; i < N; i++) \
+ { \
+ REPEAT50 (INC); \
+ } \
+ }
+
+#define TEST(NAME) \
+ TEST_LOOP (NAME##_i32, int32_t) \
+ TEST_LOOP (NAME##_i64, int64_t)
+
+TEST (test)
+
+/* Check the vectorized loop for stack clash probing. */
+
+/* { dg-final { scan-assembler-times {sd\tzero,1024\(sp\)} 6 } } */
+/* { dg-final { scan-assembler-times {bge\tt1,t0,.[^\\r\\n]*} 2 } } */
+/* { dg-final { scan-assembler-times {sub\s+t1,t1,t0} 2 } } */