[09/22] aarch64: Add GCS support for nonlocal stack save
Commit Message
From: Szabolcs Nagy <szabolcs.nagy@arm.com>
Nonlocal stack save and restore also has to save and restore the GCS
pointer. This is used by __builtin_setjmp/longjmp and by nonlocal goto.
The GCS-specific code is only emitted if GCS branch-protection is
enabled, and the emitted code always checks at run time whether GCS is enabled.
The new -mbranch-protection=gcs and old -mbranch-protection=none code
are ABI compatible: the jmpbuf for __builtin_setjmp has space for 5
pointers, laid out as follows:
old layout: fp, pc, sp, unused, unused
new layout: fp, pc, sp, gcsp, unused
Note: the ILP32 code generation is wrong, as it saves the pointers in
Pmode (i.e. 8 bytes per pointer) while the user-supplied buffer is sized
for 5 pointers of 4 bytes each; this is not fixed here.
Nonlocal goto has no ABI compatibility issues, as the goto and its
destination are in the same translation unit.
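For illustration, here is a minimal user-level sketch of the buffer that
__builtin_setjmp/__builtin_longjmp operate on, assuming the 5-pointer
layout above (the variable and function names are hypothetical, not from
the patch):

  /* Five-word buffer used by __builtin_setjmp/longjmp; with GCS enabled,
     slot 3 additionally holds the saved GCS pointer.  */
  static void *env[5];              /* fp, pc, sp, gcsp, unused */

  static void
  jump_back (void)
  {
    __builtin_longjmp (env, 1);     /* restores sp (and gcsp), resumes at pc */
  }

  int
  use_jmpbuf (void)
  {
    if (__builtin_setjmp (env) == 0)
      {
        jump_back ();
        return 0;                   /* not reached */
      }
    return 1;                       /* reached via the longjmp */
  }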
gcc/ChangeLog:
* config/aarch64/aarch64.h (STACK_SAVEAREA_MODE): Make space for gcs.
* config/aarch64/aarch64.md (save_stack_nonlocal): New.
(restore_stack_nonlocal): New.
---
gcc/config/aarch64/aarch64.h | 7 +++
gcc/config/aarch64/aarch64.md | 82 +++++++++++++++++++++++++++++++++++
2 files changed, 89 insertions(+)
Comments
Yury Khrustalev <yury.khrustalev@arm.com> writes:
> From: Szabolcs Nagy <szabolcs.nagy@arm.com>
>
> Nonlocal stack save and restore has to also save and restore the GCS
> pointer. This is used in __builtin_setjmp/longjmp and nonlocal goto.
>
> The GCS specific code is only emitted if GCS branch-protection is
> enabled and the code always checks at runtime if GCS is enabled.
>
> The new -mbranch-protection=gcs and old -mbranch-protection=none code
> are ABI compatible: jmpbuf for __builtin_setjmp has space for 5
> pointers, the layout is
>
> old layout: fp, pc, sp, unused, unused
> new layout: fp, pc, sp, gcsp, unused
>
> Note: the ILP32 code generation is wrong as it saves the pointers with
> Pmode (i.e. 8 bytes per pointer), but the user supplied buffer size is
> for 5 pointers (4 bytes per pointer), this is not fixed.
>
> The nonlocal goto has no ABI compatibility issues as the goto and its
> destination are in the same translation unit.
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64.h (STACK_SAVEAREA_MODE): Make space for gcs.
> * config/aarch64/aarch64.md (save_stack_nonlocal): New.
> (restore_stack_nonlocal): New.
> ---
> gcc/config/aarch64/aarch64.h | 7 +++
> gcc/config/aarch64/aarch64.md | 82 +++++++++++++++++++++++++++++++++++
> 2 files changed, 89 insertions(+)
>
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index 593319fd472..43a92e85780 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -1297,6 +1297,13 @@ typedef struct
> #define CTZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) \
> ((VALUE) = GET_MODE_UNIT_BITSIZE (MODE), 2)
>
> +/* Have space for both SP and GCSPR in the NONLOCAL case in
> + emit_stack_save as well as in __builtin_setjmp, __builtin_longjmp
> + and __builtin_nonlocal_goto.
> + Note: On ILP32 the documented buf size is not enough PR84150. */
> +#define STACK_SAVEAREA_MODE(LEVEL) \
> + ((LEVEL) == SAVE_NONLOCAL ? TImode : Pmode)
It might be better to use CDImode, so that we don't claim 16-byte alignment
for -mstrict-align.
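For reference, a minimal sketch of that suggestion (not part of the posted
patch), assuming CDImode provides the required 16 bytes while keeping only
DImode alignment:

  /* Room for two DImode values (SP and GCSPR) without claiming
     16-byte alignment.  */
  #define STACK_SAVEAREA_MODE(LEVEL) \
    ((LEVEL) == SAVE_NONLOCAL ? CDImode : Pmode)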
> +
> #define INCOMING_RETURN_ADDR_RTX gen_rtx_REG (Pmode, LR_REGNUM)
>
> #define RETURN_ADDR_RTX aarch64_return_addr
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index e4e11e35b5b..6e1646387d8 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1200,6 +1200,88 @@ (define_insn "*cb<optab><mode>1"
> (const_int 1)))]
> )
>
> +(define_expand "save_stack_nonlocal"
> + [(set (match_operand 0 "memory_operand")
> + (match_operand 1 "register_operand"))]
> + ""
> +{
> + rtx stack_slot = adjust_address (operands[0], Pmode, 0);
> + emit_move_insn (stack_slot, operands[1]);
> +
> + if (aarch64_gcs_enabled ())
> + {
> + /* Save GCS with code like
> + mov x16, 1
> + chkfeat x16
> + tbnz x16, 0, .L_done
> + mrs tmp, gcspr_el0
> + str tmp, [%0, 8]
> + .L_done: */
> +
> + rtx done_label = gen_label_rtx ();
> + rtx r16 = gen_rtx_REG (DImode, R16_REGNUM);
> + emit_move_insn (r16, const1_rtx);
> + emit_insn (gen_aarch64_chkfeat ());
> + emit_insn (gen_tbranch_neqi3 (r16, const0_rtx, done_label));
> + rtx gcs_slot = adjust_address (operands[0], Pmode, GET_MODE_SIZE (Pmode));
> + rtx gcs = force_reg (Pmode, const0_rtx);
The code seems to use force_reg (Pmode, const0_rtx) to get a fresh
register, but that should be done using gen_reg_rtx (Pmode) instead.
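For reference, a minimal sketch of the suggested change (not part of the
posted patch):

  /* Allocate a fresh pseudo instead of forcing a constant into a register.  */
  rtx gcs = gen_reg_rtx (Pmode);
  emit_insn (gen_aarch64_load_gcspr (gcs));
  emit_move_insn (gcs_slot, gcs);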
Looks good otherwise. In particular, it avoids one mistake I made
in the past, in that it uses the generic optabs to generate branches,
and so should work with -mtrack-speculation. (It would be good to have
a test of nonlocal goto and -mtrack-speculation though, if the later
patches don't have one already.)
Thanks,
Richard
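For illustration, a minimal sketch of such a test (hypothetical file and
options, not from the thread), using a nested function to force a nonlocal
goto under -mtrack-speculation:

  /* { dg-do compile } */
  /* { dg-options "-O2 -mbranch-protection=gcs -mtrack-speculation" } */

  void run (void (*) (void));

  int
  foo (int *p)
  {
    __label__ out;
    /* Jumping from the nested function back into foo is a nonlocal goto.  */
    void jump (void) { goto out; }
    run (jump);
    return 0;
  out:
    return *p;
  }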
> + emit_insn (gen_aarch64_load_gcspr (gcs));
> + emit_move_insn (gcs_slot, gcs);
> + emit_label (done_label);
> + }
> + DONE;
> +})
> +
> +(define_expand "restore_stack_nonlocal"
> + [(set (match_operand 0 "register_operand" "")
> + (match_operand 1 "memory_operand" ""))]
> + ""
> +{
> + rtx stack_slot = adjust_address (operands[1], Pmode, 0);
> + emit_move_insn (operands[0], stack_slot);
> +
> + if (aarch64_gcs_enabled ())
> + {
> + /* Restore GCS with code like
> + mov x16, 1
> + chkfeat x16
> + tbnz x16, 0, .L_done
> + ldr tmp1, [%1, 8]
> + mrs tmp2, gcspr_el0
> + subs tmp2, tmp1, tmp2
> + b.eq .L_done
> + .L_loop:
> + gcspopm
> + subs tmp2, tmp2, 8
> + b.ne .L_loop
> + .L_done: */
> +
> + rtx loop_label = gen_label_rtx ();
> + rtx done_label = gen_label_rtx ();
> + rtx r16 = gen_rtx_REG (DImode, R16_REGNUM);
> + emit_move_insn (r16, const1_rtx);
> + emit_insn (gen_aarch64_chkfeat ());
> + emit_insn (gen_tbranch_neqi3 (r16, const0_rtx, done_label));
> + rtx gcs_slot = adjust_address (operands[1], Pmode, GET_MODE_SIZE (Pmode));
> + rtx gcs_old = force_reg (Pmode, const0_rtx);
> + emit_move_insn (gcs_old, gcs_slot);
> + rtx gcs_now = force_reg (Pmode, const0_rtx);
> + emit_insn (gen_aarch64_load_gcspr (gcs_now));
> + emit_insn (gen_subdi3_compare1 (gcs_now, gcs_old, gcs_now));
> + rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
> + rtx cmp_rtx = gen_rtx_fmt_ee (EQ, DImode, cc_reg, const0_rtx);
> + emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, done_label));
> + emit_label (loop_label);
> + emit_insn (gen_aarch64_gcspopm_xzr ());
> + emit_insn (gen_adddi3_compare0 (gcs_now, gcs_now, GEN_INT (-8)));
> + cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
> + cmp_rtx = gen_rtx_fmt_ee (NE, DImode, cc_reg, const0_rtx);
> + emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, loop_label));
> + emit_label (done_label);
> + }
> + DONE;
> +})
> +
> ;; -------------------------------------------------------------------
> ;; Subroutine calls and sibcalls
> ;; -------------------------------------------------------------------