| Message ID | 20230912172439.2336327-1-ross.burton@arm.com |
|---|---|
| State | New |
| Headers | show |
| Series | [mickledore] gcc: Fix -fstack-protector issue on aarch64 | expand |
FYI: one of LGE's proprietary components triggers an ICE with this applied. I'll
try to find a minimal reproducer later; this is just for other people who
might hit the same issue:
error: unrecognizable insn:
2923 | }
| ^
(insn 416 286 290 17 (parallel [
(set (mem/c:SI (plus:DI (reg/f:DI 29 x29)
(const_int -260 [0xfffffffffffffefc])) [1
redacted.pixel_format+0 S4 A32])
(const_int 0 [0]))
(set (mem/c:SI (plus:DI (reg/f:DI 29 x29)
(const_int -256 [0xffffffffffffff00])) [1
redacted.pixel_value+0 S4 A128])
(reg/v:SI 22 x22 [orig:141 color ] [141]))
])
"TOPDIR/BUILD/work/redacted/redacted/redacted/git/redacted.c":2903:45 -1
(expr_list:REG_DEAD (reg/v:SI 22 x22 [orig:141 color ] [141])
(nil)))
during RTL pass: cprop_hardreg
TOPDIR/BUILD/work/redacted/redacted/redacted/git/redacted.c:2923:1:
internal compiler error: in extract_insn, at recog.cc:2791
0x191624a internal_error(char const*, ...)
???:0
0x6bee26 fancy_abort(char const*, int, char const*)
???:0
0x697469 _fatal_insn(char const*, rtx_def const*, char const*, int, char
const*)
???:0
0x697485 _fatal_insn_not_found(rtx_def const*, char const*, int, char
const*)
???:0
0xbef198 extract_constrain_insn(rtx_insn*)
???:0
On Tue, Sep 12, 2023 at 7:24 PM Ross Burton <ross.burton@arm.com> wrote:
> From: Ross Burton <ross.burton@arm.com>
>
> This series of patches fixes deficiencies in GCC's -fstack-protector
> implementation for AArch64 when using dynamically allocated stack space.
> This is CVE-2023-4039. See:
>
>
> https://developer.arm.com/Arm%20Security%20Center/GCC%20Stack%20Protector%20Vulnerability%20AArch64
>
> https://github.com/metaredteam/external-disclosures/security/advisories/GHSA-x7ch-h5rf-w2mf
>
> for more details.
>
> Signed-off-by: Ross Burton <ross.burton@arm.com>
> ---
> meta/recipes-devtools/gcc/gcc-12.3.inc | 1 +
> .../gcc/gcc/CVE-2023-4039.patch | 3093 +++++++++++++++++
> 2 files changed, 3094 insertions(+)
> create mode 100644 meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch
>
> diff --git a/meta/recipes-devtools/gcc/gcc-12.3.inc
> b/meta/recipes-devtools/gcc/gcc-12.3.inc
> index 4ec03f925c8..5896f26e1af 100644
> --- a/meta/recipes-devtools/gcc/gcc-12.3.inc
> +++ b/meta/recipes-devtools/gcc/gcc-12.3.inc
> @@ -63,6 +63,7 @@ SRC_URI = "${BASEURI} \
> file://0026-rust-recursion-limit.patch \
> file://prefix-map-realpath.patch \
> file://hardcoded-paths.patch \
> + file://CVE-2023-4039.patch \
> "
> SRC_URI[sha256sum] =
> "949a5d4f99e786421a93b532b22ffab5578de7321369975b91aec97adfda8c3b"
>
> diff --git a/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch
> b/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch
> new file mode 100644
> index 00000000000..8cb52849cd3
> --- /dev/null
> +++ b/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch
> @@ -0,0 +1,3093 @@
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Subject: [PATCH 00/19] aarch64: Fix -fstack-protector issue
> +Date: Tue, 12 Sep 2023 16:25:10 +0100
> +
> +This series of patches fixes deficiencies in GCC's -fstack-protector
> +implementation for AArch64 when using dynamically allocated stack space.
> +This is CVE-2023-4039. See:
> +
> +
> https://developer.arm.com/Arm%20Security%20Center/GCC%20Stack%20Protector%20Vulnerability%20AArch64
> +
> https://github.com/metaredteam/external-disclosures/security/advisories/GHSA-x7ch-h5rf-w2mf
> +
> +for more details.
> +
> +The fix is to put the saved registers above the locals area when
> +-fstack-protector is used.
> +
> +The series also fixes a stack-clash problem that I found while working
> +on the CVE. In unpatched sources, the stack-clash problem would only
> +trigger for unrealistic numbers of arguments (8K 64-bit arguments, or an
> +equivalent). But it would be a more significant issue with the new
> +-fstack-protector frame layout. It's therefore important that both
> +problems are fixed together.
> +
> +Some reorganisation of the code seemed necessary to fix the problems in a
> +cleanish way. The series is therefore quite long, but only a handful of
> +patches should have any effect on code generation.
> +
> +See the individual patches for a detailed description.
> +
> +Tested on aarch64-linux-gnu. Pushed to trunk and to all active branches.
> +I've also pushed backports to GCC 7+ to vendors/ARM/heads/CVE-2023-4039.
> +
> +CVE: CVE-2023-4039
> +Upstream-Status: Backport
> +Signed-off-by: Ross Burton <ross.burton@arm.com>
> +
> +
> +From 62fbb215cc817e9f2c1ca80282a64f4ee30806bc Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:48 +0100
> +Subject: [PATCH 01/19] aarch64: Use local frame vars in shrink-wrapping
> code
> +
> +aarch64_layout_frame uses a shorthand for referring to
> +cfun->machine->frame:
> +
> + aarch64_frame &frame = cfun->machine->frame;
> +
> +This patch does the same for some other heavy users of the structure.
> +No functional change intended.
> +
> +gcc/
> + * config/aarch64/aarch64.cc (aarch64_save_callee_saves): Use
> + a local shorthand for cfun->machine->frame.
> + (aarch64_restore_callee_saves, aarch64_get_separate_components):
> + (aarch64_process_components): Likewise.
> + (aarch64_allocate_and_probe_stack_space): Likewise.
> + (aarch64_expand_prologue, aarch64_expand_epilogue): Likewise.
> + (aarch64_layout_frame): Use existing shorthand for one more case.
> +---
> + gcc/config/aarch64/aarch64.cc | 123 ++++++++++++++++++----------------
> + 1 file changed, 64 insertions(+), 59 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index 226dc9dffd4..ae42ffdedbe 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8351,7 +8351,7 @@ aarch64_layout_frame (void)
> + frame.is_scs_enabled
> + = (!crtl->calls_eh_return
> + && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
> +- && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0));
> ++ && known_ge (frame.reg_offset[LR_REGNUM], 0));
> +
> + /* When shadow call stack is enabled, the scs_pop in the epilogue will
> + restore x30, and we don't need to pop x30 again in the traditional
> +@@ -8763,6 +8763,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
> + unsigned start, unsigned limit, bool skip_wb,
> + bool hard_fp_valid_p)
> + {
> ++ aarch64_frame &frame = cfun->machine->frame;
> + rtx_insn *insn;
> + unsigned regno;
> + unsigned regno2;
> +@@ -8777,8 +8778,8 @@ aarch64_save_callee_saves (poly_int64 start_offset,
> + bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
> +
> + if (skip_wb
> +- && (regno == cfun->machine->frame.wb_push_candidate1
> +- || regno == cfun->machine->frame.wb_push_candidate2))
> ++ && (regno == frame.wb_push_candidate1
> ++ || regno == frame.wb_push_candidate2))
> + continue;
> +
> + if (cfun->machine->reg_is_wrapped_separately[regno])
> +@@ -8786,7 +8787,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
> +
> + machine_mode mode = aarch64_reg_save_mode (regno);
> + reg = gen_rtx_REG (mode, regno);
> +- offset = start_offset + cfun->machine->frame.reg_offset[regno];
> ++ offset = start_offset + frame.reg_offset[regno];
> + rtx base_rtx = stack_pointer_rtx;
> + poly_int64 sp_offset = offset;
> +
> +@@ -8799,7 +8800,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
> + {
> + gcc_assert (known_eq (start_offset, 0));
> + poly_int64 fp_offset
> +- = cfun->machine->frame.below_hard_fp_saved_regs_size;
> ++ = frame.below_hard_fp_saved_regs_size;
> + if (hard_fp_valid_p)
> + base_rtx = hard_frame_pointer_rtx;
> + else
> +@@ -8821,8 +8822,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
> + && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <=
> limit
> + && !cfun->machine->reg_is_wrapped_separately[regno2]
> + && known_eq (GET_MODE_SIZE (mode),
> +- cfun->machine->frame.reg_offset[regno2]
> +- - cfun->machine->frame.reg_offset[regno]))
> ++ frame.reg_offset[regno2] - frame.reg_offset[regno]))
> + {
> + rtx reg2 = gen_rtx_REG (mode, regno2);
> + rtx mem2;
> +@@ -8872,6 +8872,7 @@ static void
> + aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
> + unsigned limit, bool skip_wb, rtx *cfi_ops)
> + {
> ++ aarch64_frame &frame = cfun->machine->frame;
> + unsigned regno;
> + unsigned regno2;
> + poly_int64 offset;
> +@@ -8888,13 +8889,13 @@ aarch64_restore_callee_saves (poly_int64
> start_offset, unsigned start,
> + rtx reg, mem;
> +
> + if (skip_wb
> +- && (regno == cfun->machine->frame.wb_pop_candidate1
> +- || regno == cfun->machine->frame.wb_pop_candidate2))
> ++ && (regno == frame.wb_pop_candidate1
> ++ || regno == frame.wb_pop_candidate2))
> + continue;
> +
> + machine_mode mode = aarch64_reg_save_mode (regno);
> + reg = gen_rtx_REG (mode, regno);
> +- offset = start_offset + cfun->machine->frame.reg_offset[regno];
> ++ offset = start_offset + frame.reg_offset[regno];
> + rtx base_rtx = stack_pointer_rtx;
> + if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
> + aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
> +@@ -8905,8 +8906,7 @@ aarch64_restore_callee_saves (poly_int64
> start_offset, unsigned start,
> + && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <=
> limit
> + && !cfun->machine->reg_is_wrapped_separately[regno2]
> + && known_eq (GET_MODE_SIZE (mode),
> +- cfun->machine->frame.reg_offset[regno2]
> +- - cfun->machine->frame.reg_offset[regno]))
> ++ frame.reg_offset[regno2] - frame.reg_offset[regno]))
> + {
> + rtx reg2 = gen_rtx_REG (mode, regno2);
> + rtx mem2;
> +@@ -9011,6 +9011,7 @@ offset_12bit_unsigned_scaled_p (machine_mode mode,
> poly_int64 offset)
> + static sbitmap
> + aarch64_get_separate_components (void)
> + {
> ++ aarch64_frame &frame = cfun->machine->frame;
> + sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
> + bitmap_clear (components);
> +
> +@@ -9027,18 +9028,18 @@ aarch64_get_separate_components (void)
> + if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
> + continue;
> +
> +- poly_int64 offset = cfun->machine->frame.reg_offset[regno];
> ++ poly_int64 offset = frame.reg_offset[regno];
> +
> + /* If the register is saved in the first SVE save slot, we use
> + it as a stack probe for -fstack-clash-protection. */
> + if (flag_stack_clash_protection
> +- && maybe_ne
> (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
> ++ && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
> + && known_eq (offset, 0))
> + continue;
> +
> + /* Get the offset relative to the register we'll use. */
> + if (frame_pointer_needed)
> +- offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
> ++ offset -= frame.below_hard_fp_saved_regs_size;
> + else
> + offset += crtl->outgoing_args_size;
> +
> +@@ -9057,11 +9058,11 @@ aarch64_get_separate_components (void)
> + /* If the spare predicate register used by big-endian SVE code
> + is call-preserved, it must be saved in the main prologue
> + before any saves that use it. */
> +- if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
> +- bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
> ++ if (frame.spare_pred_reg != INVALID_REGNUM)
> ++ bitmap_clear_bit (components, frame.spare_pred_reg);
> +
> +- unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
> +- unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
> ++ unsigned reg1 = frame.wb_push_candidate1;
> ++ unsigned reg2 = frame.wb_push_candidate2;
> + /* If registers have been chosen to be stored/restored with
> + writeback don't interfere with them to avoid having to output
> explicit
> + stack adjustment instructions. */
> +@@ -9170,6 +9171,7 @@ aarch64_get_next_set_bit (sbitmap bmp, unsigned int
> start)
> + static void
> + aarch64_process_components (sbitmap components, bool prologue_p)
> + {
> ++ aarch64_frame &frame = cfun->machine->frame;
> + rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
> + ? HARD_FRAME_POINTER_REGNUM
> + : STACK_POINTER_REGNUM);
> +@@ -9184,9 +9186,9 @@ aarch64_process_components (sbitmap components,
> bool prologue_p)
> + machine_mode mode = aarch64_reg_save_mode (regno);
> +
> + rtx reg = gen_rtx_REG (mode, regno);
> +- poly_int64 offset = cfun->machine->frame.reg_offset[regno];
> ++ poly_int64 offset = frame.reg_offset[regno];
> + if (frame_pointer_needed)
> +- offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
> ++ offset -= frame.below_hard_fp_saved_regs_size;
> + else
> + offset += crtl->outgoing_args_size;
> +
> +@@ -9211,14 +9213,14 @@ aarch64_process_components (sbitmap components,
> bool prologue_p)
> + break;
> + }
> +
> +- poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
> ++ poly_int64 offset2 = frame.reg_offset[regno2];
> + /* The next register is not of the same class or its offset is not
> + mergeable with the current one into a pair. */
> + if (aarch64_sve_mode_p (mode)
> + || !satisfies_constraint_Ump (mem)
> + || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
> + || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
> +- || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
> ++ || maybe_ne ((offset2 - frame.reg_offset[regno]),
> + GET_MODE_SIZE (mode)))
> + {
> + insn = emit_insn (set);
> +@@ -9240,7 +9242,7 @@ aarch64_process_components (sbitmap components,
> bool prologue_p)
> + /* REGNO2 can be saved/restored in a pair with REGNO. */
> + rtx reg2 = gen_rtx_REG (mode, regno2);
> + if (frame_pointer_needed)
> +- offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
> ++ offset2 -= frame.below_hard_fp_saved_regs_size;
> + else
> + offset2 += crtl->outgoing_args_size;
> + rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
> +@@ -9335,6 +9337,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1,
> rtx temp2,
> + bool frame_related_p,
> + bool final_adjustment_p)
> + {
> ++ aarch64_frame &frame = cfun->machine->frame;
> + HOST_WIDE_INT guard_size
> + = 1 << param_stack_clash_protection_guard_size;
> + HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
> +@@ -9355,25 +9358,25 @@ aarch64_allocate_and_probe_stack_space (rtx
> temp1, rtx temp2,
> + register as a probe. We can't assume that LR was saved at
> position 0
> + though, so treat any space below it as unprobed. */
> + if (final_adjustment_p
> +- && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size,
> 0))
> ++ && known_eq (frame.below_hard_fp_saved_regs_size, 0))
> + {
> +- poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
> ++ poly_int64 lr_offset = frame.reg_offset[LR_REGNUM];
> + if (known_ge (lr_offset, 0))
> + min_probe_threshold -= lr_offset.to_constant ();
> + else
> + gcc_assert (!flag_stack_clash_protection || known_eq (poly_size,
> 0));
> + }
> +
> +- poly_int64 frame_size = cfun->machine->frame.frame_size;
> ++ poly_int64 frame_size = frame.frame_size;
> +
> + /* We should always have a positive probe threshold. */
> + gcc_assert (min_probe_threshold > 0);
> +
> + if (flag_stack_clash_protection && !final_adjustment_p)
> + {
> +- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
> +- poly_int64 sve_callee_adjust =
> cfun->machine->frame.sve_callee_adjust;
> +- poly_int64 final_adjust = cfun->machine->frame.final_adjust;
> ++ poly_int64 initial_adjust = frame.initial_adjust;
> ++ poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
> ++ poly_int64 final_adjust = frame.final_adjust;
> +
> + if (known_eq (frame_size, 0))
> + {
> +@@ -9662,17 +9665,18 @@ aarch64_epilogue_uses (int regno)
> + void
> + aarch64_expand_prologue (void)
> + {
> +- poly_int64 frame_size = cfun->machine->frame.frame_size;
> +- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
> +- HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
> +- poly_int64 final_adjust = cfun->machine->frame.final_adjust;
> +- poly_int64 callee_offset = cfun->machine->frame.callee_offset;
> +- poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
> ++ aarch64_frame &frame = cfun->machine->frame;
> ++ poly_int64 frame_size = frame.frame_size;
> ++ poly_int64 initial_adjust = frame.initial_adjust;
> ++ HOST_WIDE_INT callee_adjust = frame.callee_adjust;
> ++ poly_int64 final_adjust = frame.final_adjust;
> ++ poly_int64 callee_offset = frame.callee_offset;
> ++ poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
> + poly_int64 below_hard_fp_saved_regs_size
> +- = cfun->machine->frame.below_hard_fp_saved_regs_size;
> +- unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
> +- unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
> +- bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
> ++ = frame.below_hard_fp_saved_regs_size;
> ++ unsigned reg1 = frame.wb_push_candidate1;
> ++ unsigned reg2 = frame.wb_push_candidate2;
> ++ bool emit_frame_chain = frame.emit_frame_chain;
> + rtx_insn *insn;
> +
> + if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
> +@@ -9703,7 +9707,7 @@ aarch64_expand_prologue (void)
> + }
> +
> + /* Push return address to shadow call stack. */
> +- if (cfun->machine->frame.is_scs_enabled)
> ++ if (frame.is_scs_enabled)
> + emit_insn (gen_scs_push ());
> +
> + if (flag_stack_usage_info)
> +@@ -9742,7 +9746,7 @@ aarch64_expand_prologue (void)
> +
> + /* The offset of the frame chain record (if any) from the current SP.
> */
> + poly_int64 chain_offset = (initial_adjust + callee_adjust
> +- - cfun->machine->frame.hard_fp_offset);
> ++ - frame.hard_fp_offset);
> + gcc_assert (known_ge (chain_offset, 0));
> +
> + /* The offset of the bottom of the save area from the current SP. */
> +@@ -9845,16 +9849,17 @@ aarch64_use_return_insn_p (void)
> + void
> + aarch64_expand_epilogue (bool for_sibcall)
> + {
> +- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
> +- HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
> +- poly_int64 final_adjust = cfun->machine->frame.final_adjust;
> +- poly_int64 callee_offset = cfun->machine->frame.callee_offset;
> +- poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
> ++ aarch64_frame &frame = cfun->machine->frame;
> ++ poly_int64 initial_adjust = frame.initial_adjust;
> ++ HOST_WIDE_INT callee_adjust = frame.callee_adjust;
> ++ poly_int64 final_adjust = frame.final_adjust;
> ++ poly_int64 callee_offset = frame.callee_offset;
> ++ poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
> + poly_int64 below_hard_fp_saved_regs_size
> +- = cfun->machine->frame.below_hard_fp_saved_regs_size;
> +- unsigned reg1 = cfun->machine->frame.wb_pop_candidate1;
> +- unsigned reg2 = cfun->machine->frame.wb_pop_candidate2;
> +- unsigned int last_gpr = (cfun->machine->frame.is_scs_enabled
> ++ = frame.below_hard_fp_saved_regs_size;
> ++ unsigned reg1 = frame.wb_pop_candidate1;
> ++ unsigned reg2 = frame.wb_pop_candidate2;
> ++ unsigned int last_gpr = (frame.is_scs_enabled
> + ? R29_REGNUM : R30_REGNUM);
> + rtx cfi_ops = NULL;
> + rtx_insn *insn;
> +@@ -9888,7 +9893,7 @@ aarch64_expand_epilogue (bool for_sibcall)
> + /* We need to add memory barrier to prevent read from deallocated
> stack. */
> + bool need_barrier_p
> + = maybe_ne (get_frame_size ()
> +- + cfun->machine->frame.saved_varargs_size, 0);
> ++ + frame.saved_varargs_size, 0);
> +
> + /* Emit a barrier to prevent loads from a deallocated stack. */
> + if (maybe_gt (final_adjust, crtl->outgoing_args_size)
> +@@ -9969,7 +9974,7 @@ aarch64_expand_epilogue (bool for_sibcall)
> + }
> +
> + /* Pop return address from shadow call stack. */
> +- if (cfun->machine->frame.is_scs_enabled)
> ++ if (frame.is_scs_enabled)
> + {
> + machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
> + rtx reg = gen_rtx_REG (mode, R30_REGNUM);
> +@@ -12564,24 +12569,24 @@ aarch64_can_eliminate (const int from
> ATTRIBUTE_UNUSED, const int to)
> + poly_int64
> + aarch64_initial_elimination_offset (unsigned from, unsigned to)
> + {
> ++ aarch64_frame &frame = cfun->machine->frame;
> ++
> + if (to == HARD_FRAME_POINTER_REGNUM)
> + {
> + if (from == ARG_POINTER_REGNUM)
> +- return cfun->machine->frame.hard_fp_offset;
> ++ return frame.hard_fp_offset;
> +
> + if (from == FRAME_POINTER_REGNUM)
> +- return cfun->machine->frame.hard_fp_offset
> +- - cfun->machine->frame.locals_offset;
> ++ return frame.hard_fp_offset - frame.locals_offset;
> + }
> +
> + if (to == STACK_POINTER_REGNUM)
> + {
> + if (from == FRAME_POINTER_REGNUM)
> +- return cfun->machine->frame.frame_size
> +- - cfun->machine->frame.locals_offset;
> ++ return frame.frame_size - frame.locals_offset;
> + }
> +
> +- return cfun->machine->frame.frame_size;
> ++ return frame.frame_size;
> + }
> +
> +
> +--
> +2.34.1
> +
> +
> +From 12a8889de169f892d2e927584c00d20b8b7e456f Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:49 +0100
> +Subject: [PATCH 02/19] aarch64: Avoid a use of callee_offset
> +
> +When we emit the frame chain, i.e. when we reach Here in this statement
> +of aarch64_expand_prologue:
> +
> + if (emit_frame_chain)
> + {
> + // Here
> + ...
> + }
> +
> +the stack is in one of two states:
> +
> +- We've allocated up to the frame chain, but no more.
> +
> +- We've allocated the whole frame, and the frame chain is within easy
> + reach of the new SP.
> +
> +The offset of the frame chain from the current SP is available
> +in aarch64_frame as callee_offset. It is also available as the
> +chain_offset local variable, where the latter is calculated from other
> +data. (However, chain_offset is not always equal to callee_offset when
> +!emit_frame_chain, so chain_offset isn't redundant.)
> +
> +In c600df9a4060da3c6121ff4d0b93f179eafd69d1 I switched to using
> +chain_offset for the initialisation of the hard frame pointer:
> +
> + aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
> +- stack_pointer_rtx, callee_offset,
> ++ stack_pointer_rtx, chain_offset,
> + tmp1_rtx, tmp0_rtx, frame_pointer_needed);
> +
> +But the later REG_CFA_ADJUST_CFA handling still used callee_offset.
> +
> +I think the difference is harmless, but it's more logical for the
> +CFA note to be in sync, and it's more convenient for later patches
> +if it uses chain_offset.
> +
> +gcc/
> + * config/aarch64/aarch64.cc (aarch64_expand_prologue): Use
> + chain_offset rather than callee_offset.
> +---
> + gcc/config/aarch64/aarch64.cc | 4 +---
> + 1 file changed, 1 insertion(+), 3 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index ae42ffdedbe..79253322fd7 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -9670,7 +9670,6 @@ aarch64_expand_prologue (void)
> + poly_int64 initial_adjust = frame.initial_adjust;
> + HOST_WIDE_INT callee_adjust = frame.callee_adjust;
> + poly_int64 final_adjust = frame.final_adjust;
> +- poly_int64 callee_offset = frame.callee_offset;
> + poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
> + poly_int64 below_hard_fp_saved_regs_size
> + = frame.below_hard_fp_saved_regs_size;
> +@@ -9783,8 +9782,7 @@ aarch64_expand_prologue (void)
> + implicit. */
> + if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
> + {
> +- rtx src = plus_constant (Pmode, stack_pointer_rtx,
> +- callee_offset);
> ++ rtx src = plus_constant (Pmode, stack_pointer_rtx,
> chain_offset);
> + add_reg_note (insn, REG_CFA_ADJUST_CFA,
> + gen_rtx_SET (hard_frame_pointer_rtx, src));
> + }
> +--
> +2.34.1
> +
> +
> +From 03d5e89e7f3be53fd7142556e8e0a2774c653dca Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:49 +0100
> +Subject: [PATCH 03/19] aarch64: Explicitly handle frames with no saved
> + registers
> +
> +If a frame has no saved registers, it can be allocated in one go.
> +There is no need to treat the areas below and above the saved
> +registers as separate.
> +
> +And if we allocate the frame in one go, it should be allocated
> +as the initial_adjust rather than the final_adjust. This allows the
> +frame size to grow to guard_size - guard_used_by_caller before a stack
> +probe is needed. (A frame with no register saves is necessarily a
> +leaf frame.)
> +
> +This is a no-op as things stand, since a leaf function will have
> +no outgoing arguments, and so all the frame will be above where
> +the saved registers normally go.
> +
> +gcc/
> + * config/aarch64/aarch64.cc (aarch64_layout_frame): Explicitly
> + allocate the frame in one go if there are no saved registers.
> +---
> + gcc/config/aarch64/aarch64.cc | 8 +++++---
> + 1 file changed, 5 insertions(+), 3 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index 79253322fd7..e1f21230c15 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8378,9 +8378,11 @@ aarch64_layout_frame (void)
> +
> + HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
> + HOST_WIDE_INT const_saved_regs_size;
> +- if (frame.frame_size.is_constant (&const_size)
> +- && const_size < max_push_offset
> +- && known_eq (frame.hard_fp_offset, const_size))
> ++ if (known_eq (frame.saved_regs_size, 0))
> ++ frame.initial_adjust = frame.frame_size;
> ++ else if (frame.frame_size.is_constant (&const_size)
> ++ && const_size < max_push_offset
> ++ && known_eq (frame.hard_fp_offset, const_size))
> + {
> + /* Simple, small frame with no outgoing arguments:
> +
> +--
> +2.34.1
> +
> +
> +From 49c2eb7616756c323b7f6b18d8616ec945eb1263 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:49 +0100
> +Subject: [PATCH 04/19] aarch64: Add bytes_below_saved_regs to frame info
> +
> +The frame layout code currently hard-codes the assumption that
> +the number of bytes below the saved registers is equal to the
> +size of the outgoing arguments. This patch abstracts that
> +value into a new field of aarch64_frame.
> +
> +gcc/
> + * config/aarch64/aarch64.h
> (aarch64_frame::bytes_below_saved_regs): New
> + field.
> + * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize it,
> + and use it instead of crtl->outgoing_args_size.
> + (aarch64_get_separate_components): Use bytes_below_saved_regs
> instead
> + of outgoing_args_size.
> + (aarch64_process_components): Likewise.
> +---
> + gcc/config/aarch64/aarch64.cc | 71 ++++++++++++++++++-----------------
> + gcc/config/aarch64/aarch64.h | 5 +++
> + 2 files changed, 41 insertions(+), 35 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index e1f21230c15..94e1b686584 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8217,6 +8217,8 @@ aarch64_layout_frame (void)
> + gcc_assert (crtl->is_leaf
> + || maybe_ne (frame.reg_offset[R30_REGNUM],
> SLOT_NOT_REQUIRED));
> +
> ++ frame.bytes_below_saved_regs = crtl->outgoing_args_size;
> ++
> + /* Now assign stack slots for the registers. Start with the predicate
> + registers, since predicate LDR and STR have a relatively small
> + offset range. These saves happen below the hard frame pointer. */
> +@@ -8321,18 +8323,18 @@ aarch64_layout_frame (void)
> +
> + poly_int64 varargs_and_saved_regs_size = offset +
> frame.saved_varargs_size;
> +
> +- poly_int64 above_outgoing_args
> ++ poly_int64 saved_regs_and_above
> + = aligned_upper_bound (varargs_and_saved_regs_size
> + + get_frame_size (),
> + STACK_BOUNDARY / BITS_PER_UNIT);
> +
> + frame.hard_fp_offset
> +- = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
> ++ = saved_regs_and_above - frame.below_hard_fp_saved_regs_size;
> +
> + /* Both these values are already aligned. */
> +- gcc_assert (multiple_p (crtl->outgoing_args_size,
> ++ gcc_assert (multiple_p (frame.bytes_below_saved_regs,
> + STACK_BOUNDARY / BITS_PER_UNIT));
> +- frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
> ++ frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs;
> +
> + frame.locals_offset = frame.saved_varargs_size;
> +
> +@@ -8376,7 +8378,7 @@ aarch64_layout_frame (void)
> + else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
> + max_push_offset = 256;
> +
> +- HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
> ++ HOST_WIDE_INT const_size, const_below_saved_regs, const_fp_offset;
> + HOST_WIDE_INT const_saved_regs_size;
> + if (known_eq (frame.saved_regs_size, 0))
> + frame.initial_adjust = frame.frame_size;
> +@@ -8384,31 +8386,31 @@ aarch64_layout_frame (void)
> + && const_size < max_push_offset
> + && known_eq (frame.hard_fp_offset, const_size))
> + {
> +- /* Simple, small frame with no outgoing arguments:
> ++ /* Simple, small frame with no data below the saved registers.
> +
> + stp reg1, reg2, [sp, -frame_size]!
> + stp reg3, reg4, [sp, 16] */
> + frame.callee_adjust = const_size;
> + }
> +- else if (crtl->outgoing_args_size.is_constant
> (&const_outgoing_args_size)
> ++ else if (frame.bytes_below_saved_regs.is_constant
> (&const_below_saved_regs)
> + && frame.saved_regs_size.is_constant (&const_saved_regs_size)
> +- && const_outgoing_args_size + const_saved_regs_size < 512
> +- /* We could handle this case even with outgoing args, provided
> +- that the number of args left us with valid offsets for all
> +- predicate and vector save slots. It's such a rare case that
> +- it hardly seems worth the effort though. */
> +- && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
> ++ && const_below_saved_regs + const_saved_regs_size < 512
> ++ /* We could handle this case even with data below the saved
> ++ registers, provided that that data left us with valid offsets
> ++ for all predicate and vector save slots. It's such a rare
> ++ case that it hardly seems worth the effort though. */
> ++ && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
> + && !(cfun->calls_alloca
> + && frame.hard_fp_offset.is_constant (&const_fp_offset)
> + && const_fp_offset < max_push_offset))
> + {
> +- /* Frame with small outgoing arguments:
> ++ /* Frame with small area below the saved registers:
> +
> + sub sp, sp, frame_size
> +- stp reg1, reg2, [sp, outgoing_args_size]
> +- stp reg3, reg4, [sp, outgoing_args_size + 16] */
> ++ stp reg1, reg2, [sp, bytes_below_saved_regs]
> ++ stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */
> + frame.initial_adjust = frame.frame_size;
> +- frame.callee_offset = const_outgoing_args_size;
> ++ frame.callee_offset = const_below_saved_regs;
> + }
> + else if (saves_below_hard_fp_p
> + && known_eq (frame.saved_regs_size,
> +@@ -8418,30 +8420,29 @@ aarch64_layout_frame (void)
> +
> + sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
> + save SVE registers relative to SP
> +- sub sp, sp, outgoing_args_size */
> ++ sub sp, sp, bytes_below_saved_regs */
> + frame.initial_adjust = (frame.hard_fp_offset
> + + frame.below_hard_fp_saved_regs_size);
> +- frame.final_adjust = crtl->outgoing_args_size;
> ++ frame.final_adjust = frame.bytes_below_saved_regs;
> + }
> + else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
> + && const_fp_offset < max_push_offset)
> + {
> +- /* Frame with large outgoing arguments or SVE saves, but with
> +- a small local area:
> ++ /* Frame with large area below the saved registers, or with SVE
> saves,
> ++ but with a small area above:
> +
> + stp reg1, reg2, [sp, -hard_fp_offset]!
> + stp reg3, reg4, [sp, 16]
> + [sub sp, sp, below_hard_fp_saved_regs_size]
> + [save SVE registers relative to SP]
> +- sub sp, sp, outgoing_args_size */
> ++ sub sp, sp, bytes_below_saved_regs */
> + frame.callee_adjust = const_fp_offset;
> + frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
> +- frame.final_adjust = crtl->outgoing_args_size;
> ++ frame.final_adjust = frame.bytes_below_saved_regs;
> + }
> + else
> + {
> +- /* Frame with large local area and outgoing arguments or SVE saves,
> +- using frame pointer:
> ++ /* General case:
> +
> + sub sp, sp, hard_fp_offset
> + stp x29, x30, [sp, 0]
> +@@ -8449,10 +8450,10 @@ aarch64_layout_frame (void)
> + stp reg3, reg4, [sp, 16]
> + [sub sp, sp, below_hard_fp_saved_regs_size]
> + [save SVE registers relative to SP]
> +- sub sp, sp, outgoing_args_size */
> ++ sub sp, sp, bytes_below_saved_regs */
> + frame.initial_adjust = frame.hard_fp_offset;
> + frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
> +- frame.final_adjust = crtl->outgoing_args_size;
> ++ frame.final_adjust = frame.bytes_below_saved_regs;
> + }
> +
> + /* Make sure the individual adjustments add up to the full frame size. */
> +@@ -9043,7 +9044,7 @@ aarch64_get_separate_components (void)
> + if (frame_pointer_needed)
> + offset -= frame.below_hard_fp_saved_regs_size;
> + else
> +- offset += crtl->outgoing_args_size;
> ++ offset += frame.bytes_below_saved_regs;
> +
> + /* Check that we can access the stack slot of the register with one
> + direct load with no adjustments needed. */
> +@@ -9192,7 +9193,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
> + if (frame_pointer_needed)
> + offset -= frame.below_hard_fp_saved_regs_size;
> + else
> +- offset += crtl->outgoing_args_size;
> ++ offset += frame.bytes_below_saved_regs;
> +
> + rtx addr = plus_constant (Pmode, ptr_reg, offset);
> + rtx mem = gen_frame_mem (mode, addr);
> +@@ -9246,7 +9247,7 @@ aarch64_process_components (sbitmap components, bool prologue_p)
> + if (frame_pointer_needed)
> + offset2 -= frame.below_hard_fp_saved_regs_size;
> + else
> +- offset2 += crtl->outgoing_args_size;
> ++ offset2 += frame.bytes_below_saved_regs;
> + rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
> + rtx mem2 = gen_frame_mem (mode, addr2);
> + rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
> +@@ -9320,10 +9321,10 @@ aarch64_stack_clash_protection_alloca_probe_range (void)
> + registers. If POLY_SIZE is not large enough to require a probe this function
> + will only adjust the stack. When allocating the stack space
> + FRAME_RELATED_P is then used to indicate if the allocation is frame related.
> +- FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
> +- arguments. If we are then we ensure that any allocation larger than the ABI
> +- defined buffer needs a probe so that the invariant of having a 1KB buffer is
> +- maintained.
> ++ FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
> ++ the saved registers. If we are then we ensure that any allocation
> ++ larger than the ABI defined buffer needs a probe so that the
> ++ invariant of having a 1KB buffer is maintained.
> +
> + We emit barriers after each stack adjustment to prevent optimizations from
> + breaking the invariant that we never drop the stack more than a page. This
> +@@ -9532,7 +9533,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
> + /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
> + be probed. This maintains the requirement that each page is probed at
> + least once. For initial probing we probe only if the allocation is
> +- more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
> ++ more than GUARD_SIZE - buffer, and below the saved registers we probe
> + if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
> + GUARD_SIZE. This works that for any allocation that is large enough to
> + trigger a probe here, we'll have at least one, and if they're not large
> +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> +index 6834c3e9922..1e105e12db8 100644
> +--- a/gcc/config/aarch64/aarch64.h
> ++++ b/gcc/config/aarch64/aarch64.h
> +@@ -871,6 +871,11 @@ struct GTY (()) aarch64_frame
> + /* The size of the callee-save registers with a slot in REG_OFFSET. */
> + poly_int64 saved_regs_size;
> +
> ++ /* The number of bytes between the bottom of the static frame (the bottom
> ++ of the outgoing arguments) and the bottom of the register save area.
> ++ This value is always a multiple of STACK_BOUNDARY. */
> ++ poly_int64 bytes_below_saved_regs;
> ++
> + /* The size of the callee-save registers with a slot in REG_OFFSET that
> + are saved below the hard frame pointer. */
> + poly_int64 below_hard_fp_saved_regs_size;
> +--
> +2.34.1
> +
> +
> +From 34081079ea4de0c98331843f574b5f6f94d7b234 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:50 +0100
> +Subject: [PATCH 05/19] aarch64: Add bytes_below_hard_fp to frame info
> +
> +Following on from the previous bytes_below_saved_regs patch, this one
> +records the number of bytes that are below the hard frame pointer.
> +This eventually replaces below_hard_fp_saved_regs_size.
> +
> +If a frame pointer is not needed, the epilogue adds final_adjust
> +to the stack pointer before restoring registers:
> +
> + aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
> +
> +Therefore, if the epilogue needs to restore the stack pointer from
> +the hard frame pointer, the directly corresponding offset is:
> +
> + -bytes_below_hard_fp + final_adjust
> +
> +i.e. go from the hard frame pointer to the bottom of the frame,
> +then add the same amount as if we were using the stack pointer
> +from the outset.
> +
> +gcc/
> + * config/aarch64/aarch64.h (aarch64_frame::bytes_below_hard_fp): New
> + field.
> + * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize it.
> + (aarch64_expand_epilogue): Use it instead of
> + below_hard_fp_saved_regs_size.
> +---
> + gcc/config/aarch64/aarch64.cc | 6 +++---
> + gcc/config/aarch64/aarch64.h | 5 +++++
> + 2 files changed, 8 insertions(+), 3 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index 94e1b686584..c7d84245fbf 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8269,6 +8269,7 @@ aarch64_layout_frame (void)
> + of the callee save area. */
> + bool saves_below_hard_fp_p = maybe_ne (offset, 0);
> + frame.below_hard_fp_saved_regs_size = offset;
> ++ frame.bytes_below_hard_fp = offset + frame.bytes_below_saved_regs;
> + if (frame.emit_frame_chain)
> + {
> + /* FP and LR are placed in the linkage record. */
> +@@ -9856,8 +9857,7 @@ aarch64_expand_epilogue (bool for_sibcall)
> + poly_int64 final_adjust = frame.final_adjust;
> + poly_int64 callee_offset = frame.callee_offset;
> + poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
> +- poly_int64 below_hard_fp_saved_regs_size
> +- = frame.below_hard_fp_saved_regs_size;
> ++ poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
> + unsigned reg1 = frame.wb_pop_candidate1;
> + unsigned reg2 = frame.wb_pop_candidate2;
> + unsigned int last_gpr = (frame.is_scs_enabled
> +@@ -9915,7 +9915,7 @@ aarch64_expand_epilogue (bool for_sibcall)
> + is restored on the instruction doing the writeback. */
> + aarch64_add_offset (Pmode, stack_pointer_rtx,
> + hard_frame_pointer_rtx,
> +- -callee_offset - below_hard_fp_saved_regs_size,
> ++ -bytes_below_hard_fp + final_adjust,
> + tmp1_rtx, tmp0_rtx, callee_adjust == 0);
> + else
> + /* The case where we need to re-use the register here is very rare,
> so
> +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> +index 1e105e12db8..de68ff7202f 100644
> +--- a/gcc/config/aarch64/aarch64.h
> ++++ b/gcc/config/aarch64/aarch64.h
> +@@ -880,6 +880,11 @@ struct GTY (()) aarch64_frame
> + are saved below the hard frame pointer. */
> + poly_int64 below_hard_fp_saved_regs_size;
> +
> ++ /* The number of bytes between the bottom of the static frame (the bottom
> ++ of the outgoing arguments) and the hard frame pointer. This value is
> ++ always a multiple of STACK_BOUNDARY. */
> ++ poly_int64 bytes_below_hard_fp;
> ++
> + /* Offset from the base of the frame (incomming SP) to the
> + top of the locals area. This value is always a multiple of
> + STACK_BOUNDARY. */
> +--
> +2.34.1
> +
> +
> +From 187861af7c51db9eddc6f954b589c121b210fc74 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:50 +0100
> +Subject: [PATCH 06/19] aarch64: Tweak aarch64_save/restore_callee_saves
> +
> +aarch64_save_callee_saves and aarch64_restore_callee_saves took
> +a parameter called start_offset that gives the offset of the
> +bottom of the saved register area from the current stack pointer.
> +However, it's more convenient for later patches if we use the
> +bottom of the entire frame as the reference point, rather than
> +the bottom of the saved registers.
> +
> +Doing that removes the need for the callee_offset field.
> +Other than that, this is not a win on its own. It only really
> +makes sense in combination with the follow-on patches.
> +
> +gcc/
> + * config/aarch64/aarch64.h (aarch64_frame::callee_offset): Delete.
> + * config/aarch64/aarch64.cc (aarch64_layout_frame): Remove
> + callee_offset handling.
> + (aarch64_save_callee_saves): Replace the start_offset parameter
> + with a bytes_below_sp parameter.
> + (aarch64_restore_callee_saves): Likewise.
> + (aarch64_expand_prologue): Update accordingly.
> + (aarch64_expand_epilogue): Likewise.
> +---
> + gcc/config/aarch64/aarch64.cc | 56 +++++++++++++++++------------------
> + gcc/config/aarch64/aarch64.h | 4 ---
> + 2 files changed, 28 insertions(+), 32 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index c7d84245fbf..e79551af41d 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8343,7 +8343,6 @@ aarch64_layout_frame (void)
> + frame.final_adjust = 0;
> + frame.callee_adjust = 0;
> + frame.sve_callee_adjust = 0;
> +- frame.callee_offset = 0;
> +
> + frame.wb_pop_candidate1 = frame.wb_push_candidate1;
> + frame.wb_pop_candidate2 = frame.wb_push_candidate2;
> +@@ -8411,7 +8410,6 @@ aarch64_layout_frame (void)
> + stp reg1, reg2, [sp, bytes_below_saved_regs]
> + stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */
> + frame.initial_adjust = frame.frame_size;
> +- frame.callee_offset = const_below_saved_regs;
> + }
> + else if (saves_below_hard_fp_p
> + && known_eq (frame.saved_regs_size,
> +@@ -8758,12 +8756,13 @@ aarch64_add_cfa_expression (rtx_insn *insn, rtx
> reg,
> + }
> +
> + /* Emit code to save the callee-saved registers from register number
> START
> +- to LIMIT to the stack at the location starting at offset START_OFFSET,
> +- skipping any write-back candidates if SKIP_WB is true.
> HARD_FP_VALID_P
> +- is true if the hard frame pointer has been set up. */
> ++ to LIMIT to the stack. The stack pointer is currently BYTES_BELOW_SP
> ++ bytes above the bottom of the static frame. Skip any write-back
> ++ candidates if SKIP_WB is true. HARD_FP_VALID_P is true if the hard
> ++ frame pointer has been set up. */
> +
> + static void
> +-aarch64_save_callee_saves (poly_int64 start_offset,
> ++aarch64_save_callee_saves (poly_int64 bytes_below_sp,
> + unsigned start, unsigned limit, bool skip_wb,
> + bool hard_fp_valid_p)
> + {
> +@@ -8791,7 +8790,9 @@ aarch64_save_callee_saves (poly_int64 start_offset,
> +
> + machine_mode mode = aarch64_reg_save_mode (regno);
> + reg = gen_rtx_REG (mode, regno);
> +- offset = start_offset + frame.reg_offset[regno];
> ++ offset = (frame.reg_offset[regno]
> ++ + frame.bytes_below_saved_regs
> ++ - bytes_below_sp);
> + rtx base_rtx = stack_pointer_rtx;
> + poly_int64 sp_offset = offset;
> +
> +@@ -8802,9 +8803,7 @@ aarch64_save_callee_saves (poly_int64 start_offset,
> + else if (GP_REGNUM_P (regno)
> + && (!offset.is_constant (&const_offset) || const_offset >=
> 512))
> + {
> +- gcc_assert (known_eq (start_offset, 0));
> +- poly_int64 fp_offset
> +- = frame.below_hard_fp_saved_regs_size;
> ++ poly_int64 fp_offset = frame.bytes_below_hard_fp -
> bytes_below_sp;
> + if (hard_fp_valid_p)
> + base_rtx = hard_frame_pointer_rtx;
> + else
> +@@ -8868,12 +8867,13 @@ aarch64_save_callee_saves (poly_int64
> start_offset,
> + }
> +
> + /* Emit code to restore the callee registers from register number START
> +- up to and including LIMIT. Restore from the stack offset
> START_OFFSET,
> +- skipping any write-back candidates if SKIP_WB is true. Write the
> +- appropriate REG_CFA_RESTORE notes into CFI_OPS. */
> ++ up to and including LIMIT. The stack pointer is currently
> BYTES_BELOW_SP
> ++ bytes above the bottom of the static frame. Skip any write-back
> ++ candidates if SKIP_WB is true. Write the appropriate REG_CFA_RESTORE
> ++ notes into CFI_OPS. */
> +
> + static void
> +-aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
> ++aarch64_restore_callee_saves (poly_int64 bytes_below_sp, unsigned start,
> + unsigned limit, bool skip_wb, rtx *cfi_ops)
> + {
> + aarch64_frame &frame = cfun->machine->frame;
> +@@ -8899,7 +8899,9 @@ aarch64_restore_callee_saves (poly_int64
> start_offset, unsigned start,
> +
> + machine_mode mode = aarch64_reg_save_mode (regno);
> + reg = gen_rtx_REG (mode, regno);
> +- offset = start_offset + frame.reg_offset[regno];
> ++ offset = (frame.reg_offset[regno]
> ++ + frame.bytes_below_saved_regs
> ++ - bytes_below_sp);
> + rtx base_rtx = stack_pointer_rtx;
> + if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
> + aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
> +@@ -9675,8 +9677,6 @@ aarch64_expand_prologue (void)
> + HOST_WIDE_INT callee_adjust = frame.callee_adjust;
> + poly_int64 final_adjust = frame.final_adjust;
> + poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
> +- poly_int64 below_hard_fp_saved_regs_size
> +- = frame.below_hard_fp_saved_regs_size;
> + unsigned reg1 = frame.wb_push_candidate1;
> + unsigned reg2 = frame.wb_push_candidate2;
> + bool emit_frame_chain = frame.emit_frame_chain;
> +@@ -9752,8 +9752,8 @@ aarch64_expand_prologue (void)
> + - frame.hard_fp_offset);
> + gcc_assert (known_ge (chain_offset, 0));
> +
> +- /* The offset of the bottom of the save area from the current SP. */
> +- poly_int64 saved_regs_offset = chain_offset -
> below_hard_fp_saved_regs_size;
> ++ /* The offset of the current SP from the bottom of the static frame. */
> ++ poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;
> +
> + if (emit_frame_chain)
> + {
> +@@ -9761,7 +9761,7 @@ aarch64_expand_prologue (void)
> + {
> + reg1 = R29_REGNUM;
> + reg2 = R30_REGNUM;
> +- aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
> ++ aarch64_save_callee_saves (bytes_below_sp, reg1, reg2,
> + false, false);
> + }
> + else
> +@@ -9801,7 +9801,7 @@ aarch64_expand_prologue (void)
> + emit_insn (gen_stack_tie (stack_pointer_rtx,
> hard_frame_pointer_rtx));
> + }
> +
> +- aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
> ++ aarch64_save_callee_saves (bytes_below_sp, R0_REGNUM, R30_REGNUM,
> + callee_adjust != 0 || emit_frame_chain,
> + emit_frame_chain);
> + if (maybe_ne (sve_callee_adjust, 0))
> +@@ -9811,16 +9811,17 @@ aarch64_expand_prologue (void)
> + aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
> + sve_callee_adjust,
> + !frame_pointer_needed,
> false);
> +- saved_regs_offset += sve_callee_adjust;
> ++ bytes_below_sp -= sve_callee_adjust;
> + }
> +- aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
> ++ aarch64_save_callee_saves (bytes_below_sp, P0_REGNUM, P15_REGNUM,
> + false, emit_frame_chain);
> +- aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
> ++ aarch64_save_callee_saves (bytes_below_sp, V0_REGNUM, V31_REGNUM,
> + callee_adjust != 0 || emit_frame_chain,
> + emit_frame_chain);
> +
> + /* We may need to probe the final adjustment if it is larger than the
> guard
> + that is assumed by the called. */
> ++ gcc_assert (known_eq (bytes_below_sp, final_adjust));
> + aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
> final_adjust,
> + !frame_pointer_needed, true);
> + }
> +@@ -9855,7 +9856,6 @@ aarch64_expand_epilogue (bool for_sibcall)
> + poly_int64 initial_adjust = frame.initial_adjust;
> + HOST_WIDE_INT callee_adjust = frame.callee_adjust;
> + poly_int64 final_adjust = frame.final_adjust;
> +- poly_int64 callee_offset = frame.callee_offset;
> + poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
> + poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
> + unsigned reg1 = frame.wb_pop_candidate1;
> +@@ -9925,9 +9925,9 @@ aarch64_expand_epilogue (bool for_sibcall)
> +
> + /* Restore the vector registers before the predicate registers,
> + so that we can use P4 as a temporary for big-endian SVE frames. */
> +- aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
> ++ aarch64_restore_callee_saves (final_adjust, V0_REGNUM, V31_REGNUM,
> + callee_adjust != 0, &cfi_ops);
> +- aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
> ++ aarch64_restore_callee_saves (final_adjust, P0_REGNUM, P15_REGNUM,
> + false, &cfi_ops);
> + if (maybe_ne (sve_callee_adjust, 0))
> + aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
> +@@ -9935,7 +9935,7 @@ aarch64_expand_epilogue (bool for_sibcall)
> + /* When shadow call stack is enabled, the scs_pop in the epilogue will
> + restore x30, we don't need to restore x30 again in the traditional
> + way. */
> +- aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
> ++ aarch64_restore_callee_saves (final_adjust + sve_callee_adjust,
> + R0_REGNUM, last_gpr,
> + callee_adjust != 0, &cfi_ops);
> +
> +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> +index de68ff7202f..94fca4b9471 100644
> +--- a/gcc/config/aarch64/aarch64.h
> ++++ b/gcc/config/aarch64/aarch64.h
> +@@ -907,10 +907,6 @@ struct GTY (()) aarch64_frame
> + It is zero when no push is used. */
> + HOST_WIDE_INT callee_adjust;
> +
> +- /* The offset from SP to the callee-save registers after
> initial_adjust.
> +- It may be non-zero if no push is used (ie. callee_adjust == 0). */
> +- poly_int64 callee_offset;
> +-
> + /* The size of the stack adjustment before saving or after restoring
> + SVE registers. */
> + poly_int64 sve_callee_adjust;
> +--
> +2.34.1
> +
> +
> +From 2b983f9064d808daf909bde1d4a13980934a7e6e Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:51 +0100
> +Subject: [PATCH 07/19] aarch64: Only calculate chain_offset if there is a
> + chain
> +
> +After previous patches, it is no longer necessary to calculate
> +a chain_offset in cases where there is no chain record.
> +
> +gcc/
> + * config/aarch64/aarch64.cc (aarch64_expand_prologue): Move the
> + calculation of chain_offset into the emit_frame_chain block.
> +---
> + gcc/config/aarch64/aarch64.cc | 10 +++++-----
> + 1 file changed, 5 insertions(+), 5 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index e79551af41d..d71a042d611 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -9747,16 +9747,16 @@ aarch64_expand_prologue (void)
> + if (callee_adjust != 0)
> + aarch64_push_regs (reg1, reg2, callee_adjust);
> +
> +- /* The offset of the frame chain record (if any) from the current SP.
> */
> +- poly_int64 chain_offset = (initial_adjust + callee_adjust
> +- - frame.hard_fp_offset);
> +- gcc_assert (known_ge (chain_offset, 0));
> +-
> + /* The offset of the current SP from the bottom of the static frame.
> */
> + poly_int64 bytes_below_sp = frame_size - initial_adjust -
> callee_adjust;
> +
> + if (emit_frame_chain)
> + {
> ++ /* The offset of the frame chain record (if any) from the current
> SP. */
> ++ poly_int64 chain_offset = (initial_adjust + callee_adjust
> ++ - frame.hard_fp_offset);
> ++ gcc_assert (known_ge (chain_offset, 0));
> ++
> + if (callee_adjust == 0)
> + {
> + reg1 = R29_REGNUM;
> +--
> +2.34.1
> +
> +
> +From 0a0a824808d1dec51004fb5805c1a0ae2a35433f Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:51 +0100
> +Subject: [PATCH 08/19] aarch64: Rename locals_offset to bytes_above_locals
> +MIME-Version: 1.0
> +Content-Type: text/plain; charset=UTF-8
> +Content-Transfer-Encoding: 8bit
> +
> +locals_offset was described as:
> +
> + /* Offset from the base of the frame (incomming SP) to the
> + top of the locals area. This value is always a multiple of
> + STACK_BOUNDARY. */
> +
> +This is implicitly an “upside down” view of the frame: the incoming
> +SP is at offset 0, and anything N bytes below the incoming SP is at
> +offset N (rather than -N).
> +
> +However, reg_offset instead uses a “right way up” view; that is,
> +it views offsets in address terms. Something above X is at a
> +positive offset from X and something below X is at a negative
> +offset from X.
> +
> +Also, even on FRAME_GROWS_DOWNWARD targets like AArch64,
> +target-independent code views offsets in address terms too:
> +locals are allocated at negative offsets to virtual_stack_vars.
> +
> +It seems confusing to have *_offset fields of the same structure
> +using different polarities like this. This patch tries to avoid
> +that by renaming locals_offset to bytes_above_locals.
> +
> +gcc/
> + * config/aarch64/aarch64.h (aarch64_frame::locals_offset): Rename to...
> + (aarch64_frame::bytes_above_locals): ...this.
> + * config/aarch64/aarch64.cc (aarch64_layout_frame)
> + (aarch64_initial_elimination_offset): Update accordingly.
> +---
> + gcc/config/aarch64/aarch64.cc | 6 +++---
> + gcc/config/aarch64/aarch64.h | 6 +++---
> + 2 files changed, 6 insertions(+), 6 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index d71a042d611..d4ec352ba98 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8337,7 +8337,7 @@ aarch64_layout_frame (void)
> + STACK_BOUNDARY / BITS_PER_UNIT));
> + frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs;
> +
> +- frame.locals_offset = frame.saved_varargs_size;
> ++ frame.bytes_above_locals = frame.saved_varargs_size;
> +
> + frame.initial_adjust = 0;
> + frame.final_adjust = 0;
> +@@ -12578,13 +12578,13 @@ aarch64_initial_elimination_offset (unsigned
> from, unsigned to)
> + return frame.hard_fp_offset;
> +
> + if (from == FRAME_POINTER_REGNUM)
> +- return frame.hard_fp_offset - frame.locals_offset;
> ++ return frame.hard_fp_offset - frame.bytes_above_locals;
> + }
> +
> + if (to == STACK_POINTER_REGNUM)
> + {
> + if (from == FRAME_POINTER_REGNUM)
> +- return frame.frame_size - frame.locals_offset;
> ++ return frame.frame_size - frame.bytes_above_locals;
> + }
> +
> + return frame.frame_size;
> +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> +index 94fca4b9471..bf46e6124aa 100644
> +--- a/gcc/config/aarch64/aarch64.h
> ++++ b/gcc/config/aarch64/aarch64.h
> +@@ -885,10 +885,10 @@ struct GTY (()) aarch64_frame
> + always a multiple of STACK_BOUNDARY. */
> + poly_int64 bytes_below_hard_fp;
> +
> +- /* Offset from the base of the frame (incomming SP) to the
> +- top of the locals area. This value is always a multiple of
> ++ /* The number of bytes between the top of the locals area and the top
> ++ of the frame (the incomming SP). This value is always a multiple of
> + STACK_BOUNDARY. */
> +- poly_int64 locals_offset;
> ++ poly_int64 bytes_above_locals;
> +
> + /* Offset from the base of the frame (incomming SP) to the
> + hard_frame_pointer. This value is always a multiple of
> +--
> +2.34.1
> +
> +
> +From 3fbf0789202b30a67b12e1fb785c7130f098d665 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:52 +0100
> +Subject: [PATCH 09/19] aarch64: Rename hard_fp_offset to bytes_above_hard_fp
> +MIME-Version: 1.0
> +Content-Type: text/plain; charset=UTF-8
> +Content-Transfer-Encoding: 8bit
> +
> +Similarly to the previous locals_offset patch, hard_fp_offset
> +was described as:
> +
> + /* Offset from the base of the frame (incomming SP) to the
> + hard_frame_pointer. This value is always a multiple of
> + STACK_BOUNDARY. */
> + poly_int64 hard_fp_offset;
> +
> +which again took an “upside-down” view: higher offsets meant lower
> +addresses. This patch renames the field to bytes_above_hard_fp instead.
> +
> +gcc/
> + * config/aarch64/aarch64.h (aarch64_frame::hard_fp_offset): Rename
> + to...
> + (aarch64_frame::bytes_above_hard_fp): ...this.
> + * config/aarch64/aarch64.cc (aarch64_layout_frame)
> + (aarch64_expand_prologue): Update accordingly.
> + (aarch64_initial_elimination_offset): Likewise.
> +---
> + gcc/config/aarch64/aarch64.cc | 26 +++++++++++++-------------
> + gcc/config/aarch64/aarch64.h | 6 +++---
> + 2 files changed, 16 insertions(+), 16 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index d4ec352ba98..3c4052740e7 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8329,7 +8329,7 @@ aarch64_layout_frame (void)
> + + get_frame_size (),
> + STACK_BOUNDARY / BITS_PER_UNIT);
> +
> +- frame.hard_fp_offset
> ++ frame.bytes_above_hard_fp
> + = saved_regs_and_above - frame.below_hard_fp_saved_regs_size;
> +
> + /* Both these values are already aligned. */
> +@@ -8378,13 +8378,13 @@ aarch64_layout_frame (void)
> + else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
> + max_push_offset = 256;
> +
> +- HOST_WIDE_INT const_size, const_below_saved_regs, const_fp_offset;
> ++ HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
> + HOST_WIDE_INT const_saved_regs_size;
> + if (known_eq (frame.saved_regs_size, 0))
> + frame.initial_adjust = frame.frame_size;
> + else if (frame.frame_size.is_constant (&const_size)
> + && const_size < max_push_offset
> +- && known_eq (frame.hard_fp_offset, const_size))
> ++ && known_eq (frame.bytes_above_hard_fp, const_size))
> + {
> + /* Simple, small frame with no data below the saved registers.
> +
> +@@ -8401,8 +8401,8 @@ aarch64_layout_frame (void)
> + case that it hardly seems worth the effort though. */
> + && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
> + && !(cfun->calls_alloca
> +- && frame.hard_fp_offset.is_constant (&const_fp_offset)
> +- && const_fp_offset < max_push_offset))
> ++ && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
> ++ && const_above_fp < max_push_offset))
> + {
> + /* Frame with small area below the saved registers:
> +
> +@@ -8420,12 +8420,12 @@ aarch64_layout_frame (void)
> + sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
> + save SVE registers relative to SP
> + sub sp, sp, bytes_below_saved_regs */
> +- frame.initial_adjust = (frame.hard_fp_offset
> ++ frame.initial_adjust = (frame.bytes_above_hard_fp
> + + frame.below_hard_fp_saved_regs_size);
> + frame.final_adjust = frame.bytes_below_saved_regs;
> + }
> +- else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
> +- && const_fp_offset < max_push_offset)
> ++ else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp)
> ++ && const_above_fp < max_push_offset)
> + {
> + /* Frame with large area below the saved registers, or with SVE
> saves,
> + but with a small area above:
> +@@ -8435,7 +8435,7 @@ aarch64_layout_frame (void)
> + [sub sp, sp, below_hard_fp_saved_regs_size]
> + [save SVE registers relative to SP]
> + sub sp, sp, bytes_below_saved_regs */
> +- frame.callee_adjust = const_fp_offset;
> ++ frame.callee_adjust = const_above_fp;
> + frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
> + frame.final_adjust = frame.bytes_below_saved_regs;
> + }
> +@@ -8450,7 +8450,7 @@ aarch64_layout_frame (void)
> + [sub sp, sp, below_hard_fp_saved_regs_size]
> + [save SVE registers relative to SP]
> + sub sp, sp, bytes_below_saved_regs */
> +- frame.initial_adjust = frame.hard_fp_offset;
> ++ frame.initial_adjust = frame.bytes_above_hard_fp;
> + frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
> + frame.final_adjust = frame.bytes_below_saved_regs;
> + }
> +@@ -9754,7 +9754,7 @@ aarch64_expand_prologue (void)
> + {
> + /* The offset of the frame chain record (if any) from the current
> SP. */
> + poly_int64 chain_offset = (initial_adjust + callee_adjust
> +- - frame.hard_fp_offset);
> ++ - frame.bytes_above_hard_fp);
> + gcc_assert (known_ge (chain_offset, 0));
> +
> + if (callee_adjust == 0)
> +@@ -12575,10 +12575,10 @@ aarch64_initial_elimination_offset (unsigned
> from, unsigned to)
> + if (to == HARD_FRAME_POINTER_REGNUM)
> + {
> + if (from == ARG_POINTER_REGNUM)
> +- return frame.hard_fp_offset;
> ++ return frame.bytes_above_hard_fp;
> +
> + if (from == FRAME_POINTER_REGNUM)
> +- return frame.hard_fp_offset - frame.bytes_above_locals;
> ++ return frame.bytes_above_hard_fp - frame.bytes_above_locals;
> + }
> +
> + if (to == STACK_POINTER_REGNUM)
> +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> +index bf46e6124aa..dd1f403f939 100644
> +--- a/gcc/config/aarch64/aarch64.h
> ++++ b/gcc/config/aarch64/aarch64.h
> +@@ -890,10 +890,10 @@ struct GTY (()) aarch64_frame
> + STACK_BOUNDARY. */
> + poly_int64 bytes_above_locals;
> +
> +- /* Offset from the base of the frame (incomming SP) to the
> +- hard_frame_pointer. This value is always a multiple of
> ++ /* The number of bytes between the hard_frame_pointer and the top of
> ++ the frame (the incomming SP). This value is always a multiple of
> + STACK_BOUNDARY. */
> +- poly_int64 hard_fp_offset;
> ++ poly_int64 bytes_above_hard_fp;
> +
> + /* The size of the frame. This value is the offset from base of the
> + frame (incomming SP) to the stack_pointer. This value is always
> +--
> +2.34.1
> +
> +
> +From aac8b31379ac3bbd14fc6427dce23f56e54e8485 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:52 +0100
> +Subject: [PATCH 10/19] aarch64: Tweak frame_size comment
> +MIME-Version: 1.0
> +Content-Type: text/plain; charset=UTF-8
> +Content-Transfer-Encoding: 8bit
> +
> +This patch fixes another case in which a value was described with
> +an “upside-down” view.
> +
> +gcc/
> + * config/aarch64/aarch64.h (aarch64_frame::frame_size): Tweak comment.
> +---
> + gcc/config/aarch64/aarch64.h | 4 ++--
> + 1 file changed, 2 insertions(+), 2 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> +index dd1f403f939..700524ae22b 100644
> +--- a/gcc/config/aarch64/aarch64.h
> ++++ b/gcc/config/aarch64/aarch64.h
> +@@ -895,8 +895,8 @@ struct GTY (()) aarch64_frame
> + STACK_BOUNDARY. */
> + poly_int64 bytes_above_hard_fp;
> +
> +- /* The size of the frame. This value is the offset from base of the
> +- frame (incomming SP) to the stack_pointer. This value is always
> ++ /* The size of the frame, i.e. the number of bytes between the bottom
> ++ of the outgoing arguments and the incoming SP. This value is always
> + a multiple of STACK_BOUNDARY. */
> + poly_int64 frame_size;
> +
> +--
> +2.34.1
> +
> +
> +From 8d5506a8aeb8dd7e8b209a3663b07688478f76b9 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:53 +0100
> +Subject: [PATCH 11/19] aarch64: Measure reg_offset from the bottom of the
> + frame
> +
> +reg_offset was measured from the bottom of the saved register area.
> +This made perfect sense with the original layout, since the bottom
> +of the saved register area was also the hard frame pointer address.
> +It became slightly less obvious with SVE, since we save SVE
> +registers below the hard frame pointer, but it still made sense.
> +
> +However, if we want to allow different frame layouts, it's more
> +convenient and obvious to measure reg_offset from the bottom of
> +the frame. After previous patches, it's also a slight simplification
> +in its own right.
> +
> +gcc/
> + * config/aarch64/aarch64.h (aarch64_frame): Add comment above
> + reg_offset.
> + * config/aarch64/aarch64.cc (aarch64_layout_frame): Walk offsets
> + from the bottom of the frame, rather than the bottom of the saved
> + register area. Measure reg_offset from the bottom of the frame
> + rather than the bottom of the saved register area.
> + (aarch64_save_callee_saves): Update accordingly.
> + (aarch64_restore_callee_saves): Likewise.
> + (aarch64_get_separate_components): Likewise.
> + (aarch64_process_components): Likewise.
> +---
> + gcc/config/aarch64/aarch64.cc | 53 ++++++++++++++++-------------------
> + gcc/config/aarch64/aarch64.h | 3 ++
> + 2 files changed, 27 insertions(+), 29 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index 3c4052740e7..97dd077844b 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8139,7 +8139,6 @@ aarch64_needs_frame_chain (void)
> + static void
> + aarch64_layout_frame (void)
> + {
> +- poly_int64 offset = 0;
> + int regno, last_fp_reg = INVALID_REGNUM;
> + machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
> + poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
> +@@ -8217,7 +8216,9 @@ aarch64_layout_frame (void)
> + gcc_assert (crtl->is_leaf
> + || maybe_ne (frame.reg_offset[R30_REGNUM],
> SLOT_NOT_REQUIRED));
> +
> +- frame.bytes_below_saved_regs = crtl->outgoing_args_size;
> ++ poly_int64 offset = crtl->outgoing_args_size;
> ++ gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
> ++ frame.bytes_below_saved_regs = offset;
> +
> + /* Now assign stack slots for the registers. Start with the predicate
> + registers, since predicate LDR and STR have a relatively small
> +@@ -8229,7 +8230,8 @@ aarch64_layout_frame (void)
> + offset += BYTES_PER_SVE_PRED;
> + }
> +
> +- if (maybe_ne (offset, 0))
> ++ poly_int64 saved_prs_size = offset - frame.bytes_below_saved_regs;
> ++ if (maybe_ne (saved_prs_size, 0))
> + {
> + /* If we have any vector registers to save above the predicate
> registers,
> + the offset of the vector register save slots need to be a multiple
> +@@ -8247,10 +8249,10 @@ aarch64_layout_frame (void)
> + offset = aligned_upper_bound (offset, STACK_BOUNDARY /
> BITS_PER_UNIT);
> + else
> + {
> +- if (known_le (offset, vector_save_size))
> +- offset = vector_save_size;
> +- else if (known_le (offset, vector_save_size * 2))
> +- offset = vector_save_size * 2;
> ++ if (known_le (saved_prs_size, vector_save_size))
> ++ offset = frame.bytes_below_saved_regs + vector_save_size;
> ++ else if (known_le (saved_prs_size, vector_save_size * 2))
> ++ offset = frame.bytes_below_saved_regs + vector_save_size * 2;
> + else
> + gcc_unreachable ();
> + }
> +@@ -8267,9 +8269,10 @@ aarch64_layout_frame (void)
> +
> + /* OFFSET is now the offset of the hard frame pointer from the bottom
> + of the callee save area. */
> +- bool saves_below_hard_fp_p = maybe_ne (offset, 0);
> +- frame.below_hard_fp_saved_regs_size = offset;
> +- frame.bytes_below_hard_fp = offset + frame.bytes_below_saved_regs;
> ++ frame.below_hard_fp_saved_regs_size = offset -
> frame.bytes_below_saved_regs;
> ++ bool saves_below_hard_fp_p
> ++ = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
> ++ frame.bytes_below_hard_fp = offset;
> + if (frame.emit_frame_chain)
> + {
> + /* FP and LR are placed in the linkage record. */
> +@@ -8320,9 +8323,10 @@ aarch64_layout_frame (void)
> +
> + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
> +
> +- frame.saved_regs_size = offset;
> ++ frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
> +
> +- poly_int64 varargs_and_saved_regs_size = offset +
> frame.saved_varargs_size;
> ++ poly_int64 varargs_and_saved_regs_size
> ++ = frame.saved_regs_size + frame.saved_varargs_size;
> +
> + poly_int64 saved_regs_and_above
> + = aligned_upper_bound (varargs_and_saved_regs_size
> +@@ -8790,9 +8794,7 @@ aarch64_save_callee_saves (poly_int64
> bytes_below_sp,
> +
> + machine_mode mode = aarch64_reg_save_mode (regno);
> + reg = gen_rtx_REG (mode, regno);
> +- offset = (frame.reg_offset[regno]
> +- + frame.bytes_below_saved_regs
> +- - bytes_below_sp);
> ++ offset = frame.reg_offset[regno] - bytes_below_sp;
> + rtx base_rtx = stack_pointer_rtx;
> + poly_int64 sp_offset = offset;
> +
> +@@ -8899,9 +8901,7 @@ aarch64_restore_callee_saves (poly_int64
> bytes_below_sp, unsigned start,
> +
> + machine_mode mode = aarch64_reg_save_mode (regno);
> + reg = gen_rtx_REG (mode, regno);
> +- offset = (frame.reg_offset[regno]
> +- + frame.bytes_below_saved_regs
> +- - bytes_below_sp);
> ++ offset = frame.reg_offset[regno] - bytes_below_sp;
> + rtx base_rtx = stack_pointer_rtx;
> + if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
> + aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
> +@@ -9040,14 +9040,12 @@ aarch64_get_separate_components (void)
> + it as a stack probe for -fstack-clash-protection. */
> + if (flag_stack_clash_protection
> + && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
> +- && known_eq (offset, 0))
> ++ && known_eq (offset, frame.bytes_below_saved_regs))
> + continue;
> +
> + /* Get the offset relative to the register we'll use. */
> + if (frame_pointer_needed)
> +- offset -= frame.below_hard_fp_saved_regs_size;
> +- else
> +- offset += frame.bytes_below_saved_regs;
> ++ offset -= frame.bytes_below_hard_fp;
> +
> + /* Check that we can access the stack slot of the register with one
> + direct load with no adjustments needed. */
> +@@ -9194,9 +9192,7 @@ aarch64_process_components (sbitmap components,
> bool prologue_p)
> + rtx reg = gen_rtx_REG (mode, regno);
> + poly_int64 offset = frame.reg_offset[regno];
> + if (frame_pointer_needed)
> +- offset -= frame.below_hard_fp_saved_regs_size;
> +- else
> +- offset += frame.bytes_below_saved_regs;
> ++ offset -= frame.bytes_below_hard_fp;
> +
> + rtx addr = plus_constant (Pmode, ptr_reg, offset);
> + rtx mem = gen_frame_mem (mode, addr);
> +@@ -9248,9 +9244,7 @@ aarch64_process_components (sbitmap components,
> bool prologue_p)
> + /* REGNO2 can be saved/restored in a pair with REGNO. */
> + rtx reg2 = gen_rtx_REG (mode, regno2);
> + if (frame_pointer_needed)
> +- offset2 -= frame.below_hard_fp_saved_regs_size;
> +- else
> +- offset2 += frame.bytes_below_saved_regs;
> ++ offset2 -= frame.bytes_below_hard_fp;
> + rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
> + rtx mem2 = gen_frame_mem (mode, addr2);
> + rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
> +@@ -9366,7 +9360,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1,
> rtx temp2,
> + if (final_adjustment_p
> + && known_eq (frame.below_hard_fp_saved_regs_size, 0))
> + {
> +- poly_int64 lr_offset = frame.reg_offset[LR_REGNUM];
> ++ poly_int64 lr_offset = (frame.reg_offset[LR_REGNUM]
> ++ - frame.bytes_below_saved_regs);
> + if (known_ge (lr_offset, 0))
> + min_probe_threshold -= lr_offset.to_constant ();
> + else
> +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> +index 700524ae22b..b6135837073 100644
> +--- a/gcc/config/aarch64/aarch64.h
> ++++ b/gcc/config/aarch64/aarch64.h
> +@@ -860,6 +860,9 @@ extern enum aarch64_processor aarch64_tune;
> + #ifdef HAVE_POLY_INT_H
> + struct GTY (()) aarch64_frame
> + {
> ++ /* The offset from the bottom of the static frame (the bottom of the
> ++ outgoing arguments) of each register save slot, or -2 if no save is
> ++ needed. */
> + poly_int64 reg_offset[LAST_SAVED_REGNUM + 1];
> +
> + /* The number of extra stack bytes taken up by register varargs.
> +--
> +2.34.1
> +
> +
> +From b47766614df3b9df878262efb2ad73aaac108363 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:53 +0100
> +Subject: [PATCH 12/19] aarch64: Simplify top of frame allocation
> +
> +After previous patches, it no longer really makes sense to allocate
> +the top of the frame in terms of varargs_and_saved_regs_size and
> +saved_regs_and_above.
> +
> +gcc/
> + * config/aarch64/aarch64.cc (aarch64_layout_frame): Simplify
> + the allocation of the top of the frame.
> +---
> + gcc/config/aarch64/aarch64.cc | 23 ++++++++---------------
> + 1 file changed, 8 insertions(+), 15 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index 97dd077844b..81935852d5b 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8325,23 +8325,16 @@ aarch64_layout_frame (void)
> +
> + frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
> +
> +- poly_int64 varargs_and_saved_regs_size
> +- = frame.saved_regs_size + frame.saved_varargs_size;
> +-
> +- poly_int64 saved_regs_and_above
> +- = aligned_upper_bound (varargs_and_saved_regs_size
> +- + get_frame_size (),
> +- STACK_BOUNDARY / BITS_PER_UNIT);
> +-
> +- frame.bytes_above_hard_fp
> +- = saved_regs_and_above - frame.below_hard_fp_saved_regs_size;
> ++ offset += get_frame_size ();
> ++ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
> ++ auto top_of_locals = offset;
> +
> +- /* Both these values are already aligned. */
> +- gcc_assert (multiple_p (frame.bytes_below_saved_regs,
> +- STACK_BOUNDARY / BITS_PER_UNIT));
> +- frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs;
> ++ offset += frame.saved_varargs_size;
> ++ gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
> ++ frame.frame_size = offset;
> +
> +- frame.bytes_above_locals = frame.saved_varargs_size;
> ++ frame.bytes_above_hard_fp = frame.frame_size -
> frame.bytes_below_hard_fp;
> ++ frame.bytes_above_locals = frame.frame_size - top_of_locals;
> +
> + frame.initial_adjust = 0;
> + frame.final_adjust = 0;
> +--
> +2.34.1
> +
> +
> +From 08f71b4bb28fb74d20e8d2927a557e8119ce9f4d Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:54 +0100
> +Subject: [PATCH 13/19] aarch64: Minor initial adjustment tweak
> +
> +This patch just changes a calculation of initial_adjust
> +to one that makes it slightly more obvious that the total
> +adjustment is frame.frame_size.
> +
> +gcc/
> + * config/aarch64/aarch64.cc (aarch64_layout_frame): Tweak
> + calculation of initial_adjust for frames in which all saves
> + are SVE saves.
> +---
> + gcc/config/aarch64/aarch64.cc | 5 ++---
> + 1 file changed, 2 insertions(+), 3 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index 81935852d5b..4d9fcf3d162 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8414,11 +8414,10 @@ aarch64_layout_frame (void)
> + {
> + /* Frame in which all saves are SVE saves:
> +
> +- sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
> ++ sub sp, sp, frame_size - bytes_below_saved_regs
> + save SVE registers relative to SP
> + sub sp, sp, bytes_below_saved_regs */
> +- frame.initial_adjust = (frame.bytes_above_hard_fp
> +- + frame.below_hard_fp_saved_regs_size);
> ++ frame.initial_adjust = frame.frame_size -
> frame.bytes_below_saved_regs;
> + frame.final_adjust = frame.bytes_below_saved_regs;
> + }
> + else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp)
> +--
> +2.34.1
> +
> +
> +From f22315d5c19e8310e4dc880fd509678fd291fca8 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:54 +0100
> +Subject: [PATCH 14/19] aarch64: Tweak stack clash boundary condition
> +
> +The AArch64 ABI says that, when stack clash protection is used,
> +there can be a maximum of 1KiB of unprobed space at sp on entry
> +to a function. Therefore, we need to probe when allocating
> +>= guard_size - 1KiB of data (>= rather than >). This is what
> +GCC does.
> +
> +If an allocation is exactly guard_size bytes, it is enough to allocate
> +those bytes and probe once at offset 1024. It isn't possible to use a
> +single probe at any other offset: higher would complicate later code,
> +by leaving more unprobed space than usual, while lower would risk
> +leaving an entire page unprobed. For simplicity, the code probes all
> +allocations at offset 1024.
> +
> +Some register saves also act as probes. If we need to allocate
> +more space below the last such register save probe, we need to
> +probe the allocation if it is > 1KiB. Again, this allocation is
> +then sometimes (but not always) probed at offset 1024. This sort of
> +allocation is currently only used for outgoing arguments, which are
> +rarely this big.
> +
> +However, the code also probed if this final outgoing-arguments
> +allocation was == 1KiB, rather than just > 1KiB. This isn't
> +necessary, since the register save then probes at offset 1024
> +as required. Continuing to probe allocations of exactly 1KiB
> +would complicate later patches.
> +
> +gcc/
> + * config/aarch64/aarch64.cc
> (aarch64_allocate_and_probe_stack_space):
> + Don't probe final allocations that are exactly 1KiB in size (after
> + unprobed space above the final allocation has been deducted).
> +
> +gcc/testsuite/
> + * gcc.target/aarch64/stack-check-prologue-17.c: New test.
> +---
> + gcc/config/aarch64/aarch64.cc | 4 +-
> + .../aarch64/stack-check-prologue-17.c | 55 +++++++++++++++++++
> + 2 files changed, 58 insertions(+), 1 deletion(-)
> + create mode 100644
> gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index 4d9fcf3d162..34c1d8614cd 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -9333,9 +9333,11 @@ aarch64_allocate_and_probe_stack_space (rtx temp1,
> rtx temp2,
> + HOST_WIDE_INT guard_size
> + = 1 << param_stack_clash_protection_guard_size;
> + HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
> ++ HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT;
> ++ gcc_assert (multiple_p (poly_size, byte_sp_alignment));
> + HOST_WIDE_INT min_probe_threshold
> + = (final_adjustment_p
> +- ? guard_used_by_caller
> ++ ? guard_used_by_caller + byte_sp_alignment
> + : guard_size - guard_used_by_caller);
> + /* When doing the final adjustment for the outgoing arguments, take
> into
> + account any unprobed space there is above the current SP. There are
> +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
> b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
> +new file mode 100644
> +index 00000000000..0d8a25d73a2
> +--- /dev/null
> ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
> +@@ -0,0 +1,55 @@
> ++/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer
> --param stack-clash-protection-guard-size=12" } */
> ++/* { dg-final { check-function-bodies "**" "" } } */
> ++
> ++void f(int, ...);
> ++void g();
> ++
> ++/*
> ++** test1:
> ++** ...
> ++** str x30, \[sp\]
> ++** sub sp, sp, #1024
> ++** cbnz w0, .*
> ++** bl g
> ++** ...
> ++*/
> ++int test1(int z) {
> ++ __uint128_t x = 0;
> ++ int y[0x400];
> ++ if (z)
> ++ {
> ++ f(0, 0, 0, 0, 0, 0, 0, &y,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
> ++ }
> ++ g();
> ++ return 1;
> ++}
> ++
> ++/*
> ++** test2:
> ++** ...
> ++** str x30, \[sp\]
> ++** sub sp, sp, #1040
> ++** str xzr, \[sp\]
> ++** cbnz w0, .*
> ++** bl g
> ++** ...
> ++*/
> ++int test2(int z) {
> ++ __uint128_t x = 0;
> ++ int y[0x400];
> ++ if (z)
> ++ {
> ++ f(0, 0, 0, 0, 0, 0, 0, &y,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x);
> ++ }
> ++ g();
> ++ return 1;
> ++}
> +--
> +2.34.1
> +
> +
> +From 15e18831bf98fd25af098b970ebf0c9a6200a34b Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:55 +0100
> +Subject: [PATCH 15/19] aarch64: Put LR save probe in first 16 bytes
> +
> +-fstack-clash-protection uses the save of LR as a probe for the next
> +allocation. The next allocation could be:
> +
> +* another part of the static frame, e.g. when allocating SVE save slots
> + or outgoing arguments
> +
> +* an alloca in the same function
> +
> +* an allocation made by a callee function
> +
> +However, when -fomit-frame-pointer is used, the LR save slot is placed
> +above the other GPR save slots. It could therefore be up to 80 bytes
> +above the base of the GPR save area (which is also the hard fp address).
> +
> +aarch64_allocate_and_probe_stack_space took this into account when
> +deciding how much subsequent space could be allocated without needing
> +a probe. However, it interacted badly with:
> +
> + /* If doing a small final adjustment, we always probe at offset 0.
> + This is done to avoid issues when LR is not at position 0 or when
> + the final adjustment is smaller than the probing offset. */
> + else if (final_adjustment_p && rounded_size == 0)
> + residual_probe_offset = 0;
> +
> +which forces any allocation that is smaller than the guard page size
> +to be probed at offset 0 rather than the usual offset 1024. It was
> +therefore possible to construct cases in which we had:
> +
> +* a probe using LR at SP + 80 bytes (or some other value >= 16)
> +* an allocation of the guard page size - 16 bytes
> +* a probe at SP + 0
> +
> +which allocates guard page size + 64 consecutive unprobed bytes.
> +
> +This patch requires the LR probe to be in the first 16 bytes of the
> +save area when stack clash protection is active. Doing it
> +unconditionally would cause code-quality regressions.
> +
> +Putting LR before other registers prevents push/pop allocation
> +when shadow call stacks are enabled, since LR is restored
> +separately from the other callee-saved registers.
> +
> +The new comment doesn't say that the probe register is required
> +to be LR, since a later patch removes that restriction.
> +
> +gcc/
> + * config/aarch64/aarch64.cc (aarch64_layout_frame): Ensure that
> + the LR save slot is in the first 16 bytes of the register save
> area.
> + Only form STP/LDP push/pop candidates if both registers are valid.
> + (aarch64_allocate_and_probe_stack_space): Remove workaround for
> + when LR was not in the first 16 bytes.
> +
> +gcc/testsuite/
> + * gcc.target/aarch64/stack-check-prologue-18.c: New test.
> + * gcc.target/aarch64/stack-check-prologue-19.c: Likewise.
> + * gcc.target/aarch64/stack-check-prologue-20.c: Likewise.
> +---
> + gcc/config/aarch64/aarch64.cc | 72 ++++++-------
> + .../aarch64/stack-check-prologue-18.c | 100 ++++++++++++++++++
> + .../aarch64/stack-check-prologue-19.c | 100 ++++++++++++++++++
> + .../aarch64/stack-check-prologue-20.c | 3 +
> + 4 files changed, 233 insertions(+), 42 deletions(-)
> + create mode 100644
> gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
> + create mode 100644
> gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
> + create mode 100644
> gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index 34c1d8614cd..16433fb70f4 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8273,26 +8273,34 @@ aarch64_layout_frame (void)
> + bool saves_below_hard_fp_p
> + = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
> + frame.bytes_below_hard_fp = offset;
> ++
> ++ auto allocate_gpr_slot = [&](unsigned int regno)
> ++ {
> ++ frame.reg_offset[regno] = offset;
> ++ if (frame.wb_push_candidate1 == INVALID_REGNUM)
> ++ frame.wb_push_candidate1 = regno;
> ++ else if (frame.wb_push_candidate2 == INVALID_REGNUM)
> ++ frame.wb_push_candidate2 = regno;
> ++ offset += UNITS_PER_WORD;
> ++ };
> ++
> + if (frame.emit_frame_chain)
> + {
> + /* FP and LR are placed in the linkage record. */
> +- frame.reg_offset[R29_REGNUM] = offset;
> +- frame.wb_push_candidate1 = R29_REGNUM;
> +- frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
> +- frame.wb_push_candidate2 = R30_REGNUM;
> +- offset += 2 * UNITS_PER_WORD;
> ++ allocate_gpr_slot (R29_REGNUM);
> ++ allocate_gpr_slot (R30_REGNUM);
> + }
> ++ else if (flag_stack_clash_protection
> ++ && known_eq (frame.reg_offset[R30_REGNUM], SLOT_REQUIRED))
> ++ /* Put the LR save slot first, since it makes a good choice of probe
> ++ for stack clash purposes. The idea is that the link register
> usually
> ++ has to be saved before a call anyway, and so we lose little by
> ++ stopping it from being individually shrink-wrapped. */
> ++ allocate_gpr_slot (R30_REGNUM);
> +
> + for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
> + if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
> +- {
> +- frame.reg_offset[regno] = offset;
> +- if (frame.wb_push_candidate1 == INVALID_REGNUM)
> +- frame.wb_push_candidate1 = regno;
> +- else if (frame.wb_push_candidate2 == INVALID_REGNUM)
> +- frame.wb_push_candidate2 = regno;
> +- offset += UNITS_PER_WORD;
> +- }
> ++ allocate_gpr_slot (regno);
> +
> + poly_int64 max_int_offset = offset;
> + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
> +@@ -8370,10 +8378,13 @@ aarch64_layout_frame (void)
> + max_push_offset to 0, because no registers are popped at this time,
> + so callee_adjust cannot be adjusted. */
> + HOST_WIDE_INT max_push_offset = 0;
> +- if (frame.wb_pop_candidate2 != INVALID_REGNUM)
> +- max_push_offset = 512;
> +- else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
> +- max_push_offset = 256;
> ++ if (frame.wb_pop_candidate1 != INVALID_REGNUM)
> ++ {
> ++ if (frame.wb_pop_candidate2 != INVALID_REGNUM)
> ++ max_push_offset = 512;
> ++ else
> ++ max_push_offset = 256;
> ++ }
> +
> + HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
> + HOST_WIDE_INT const_saved_regs_size;
> +@@ -9339,29 +9350,6 @@ aarch64_allocate_and_probe_stack_space (rtx temp1,
> rtx temp2,
> + = (final_adjustment_p
> + ? guard_used_by_caller + byte_sp_alignment
> + : guard_size - guard_used_by_caller);
> +- /* When doing the final adjustment for the outgoing arguments, take
> into
> +- account any unprobed space there is above the current SP. There are
> +- two cases:
> +-
> +- - When saving SVE registers below the hard frame pointer, we force
> +- the lowest save to take place in the prologue before doing the
> final
> +- adjustment (i.e. we don't allow the save to be shrink-wrapped).
> +- This acts as a probe at SP, so there is no unprobed space.
> +-
> +- - When there are no SVE register saves, we use the store of the link
> +- register as a probe. We can't assume that LR was saved at
> position 0
> +- though, so treat any space below it as unprobed. */
> +- if (final_adjustment_p
> +- && known_eq (frame.below_hard_fp_saved_regs_size, 0))
> +- {
> +- poly_int64 lr_offset = (frame.reg_offset[LR_REGNUM]
> +- - frame.bytes_below_saved_regs);
> +- if (known_ge (lr_offset, 0))
> +- min_probe_threshold -= lr_offset.to_constant ();
> +- else
> +- gcc_assert (!flag_stack_clash_protection || known_eq (poly_size,
> 0));
> +- }
> +-
> + poly_int64 frame_size = frame.frame_size;
> +
> + /* We should always have a positive probe threshold. */
> +@@ -9541,8 +9529,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1,
> rtx temp2,
> + if (final_adjustment_p && rounded_size != 0)
> + min_probe_threshold = 0;
> + /* If doing a small final adjustment, we always probe at offset 0.
> +- This is done to avoid issues when LR is not at position 0 or when
> +- the final adjustment is smaller than the probing offset. */
> ++ This is done to avoid issues when the final adjustment is smaller
> ++ than the probing offset. */
> + else if (final_adjustment_p && rounded_size == 0)
> + residual_probe_offset = 0;
> +
> +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
> b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
> +new file mode 100644
> +index 00000000000..82447d20fff
> +--- /dev/null
> ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
> +@@ -0,0 +1,100 @@
> ++/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer
> --param stack-clash-protection-guard-size=12" } */
> ++/* { dg-final { check-function-bodies "**" "" } } */
> ++
> ++void f(int, ...);
> ++void g();
> ++
> ++/*
> ++** test1:
> ++** ...
> ++** str x30, \[sp\]
> ++** sub sp, sp, #4064
> ++** str xzr, \[sp\]
> ++** cbnz w0, .*
> ++** bl g
> ++** ...
> ++** str x26, \[sp, #?4128\]
> ++** ...
> ++*/
> ++int test1(int z) {
> ++ __uint128_t x = 0;
> ++ int y[0x400];
> ++ if (z)
> ++ {
> ++ asm volatile ("" :::
> ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25",
> "x26");
> ++ f(0, 0, 0, 0, 0, 0, 0, &y,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x);
> ++ }
> ++ g();
> ++ return 1;
> ++}
> ++
> ++/*
> ++** test2:
> ++** ...
> ++** str x30, \[sp\]
> ++** sub sp, sp, #1040
> ++** str xzr, \[sp\]
> ++** cbnz w0, .*
> ++** bl g
> ++** ...
> ++*/
> ++int test2(int z) {
> ++ __uint128_t x = 0;
> ++ int y[0x400];
> ++ if (z)
> ++ {
> ++ asm volatile ("" :::
> ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25",
> "x26");
> ++ f(0, 0, 0, 0, 0, 0, 0, &y,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x);
> ++ }
> ++ g();
> ++ return 1;
> ++}
> ++
> ++/*
> ++** test3:
> ++** ...
> ++** str x30, \[sp\]
> ++** sub sp, sp, #1024
> ++** cbnz w0, .*
> ++** bl g
> ++** ...
> ++*/
> ++int test3(int z) {
> ++ __uint128_t x = 0;
> ++ int y[0x400];
> ++ if (z)
> ++ {
> ++ asm volatile ("" :::
> ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25",
> "x26");
> ++ f(0, 0, 0, 0, 0, 0, 0, &y,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
> ++ }
> ++ g();
> ++ return 1;
> ++}
> +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
> b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
> +new file mode 100644
> +index 00000000000..73ac3e4e4eb
> +--- /dev/null
> ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
> +@@ -0,0 +1,100 @@
> ++/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer
> --param stack-clash-protection-guard-size=12 -fsanitize=shadow-call-stack
> -ffixed-x18" } */
> ++/* { dg-final { check-function-bodies "**" "" } } */
> ++
> ++void f(int, ...);
> ++void g();
> ++
> ++/*
> ++** test1:
> ++** ...
> ++** str x30, \[sp\]
> ++** sub sp, sp, #4064
> ++** str xzr, \[sp\]
> ++** cbnz w0, .*
> ++** bl g
> ++** ...
> ++** str x26, \[sp, #?4128\]
> ++** ...
> ++*/
> ++int test1(int z) {
> ++ __uint128_t x = 0;
> ++ int y[0x400];
> ++ if (z)
> ++ {
> ++ asm volatile ("" :::
> ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25",
> "x26");
> ++ f(0, 0, 0, 0, 0, 0, 0, &y,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x);
> ++ }
> ++ g();
> ++ return 1;
> ++}
> ++
> ++/*
> ++** test2:
> ++** ...
> ++** str x30, \[sp\]
> ++** sub sp, sp, #1040
> ++** str xzr, \[sp\]
> ++** cbnz w0, .*
> ++** bl g
> ++** ...
> ++*/
> ++int test2(int z) {
> ++ __uint128_t x = 0;
> ++ int y[0x400];
> ++ if (z)
> ++ {
> ++ asm volatile ("" :::
> ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25",
> "x26");
> ++ f(0, 0, 0, 0, 0, 0, 0, &y,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x);
> ++ }
> ++ g();
> ++ return 1;
> ++}
> ++
> ++/*
> ++** test3:
> ++** ...
> ++** str x30, \[sp\]
> ++** sub sp, sp, #1024
> ++** cbnz w0, .*
> ++** bl g
> ++** ...
> ++*/
> ++int test3(int z) {
> ++ __uint128_t x = 0;
> ++ int y[0x400];
> ++ if (z)
> ++ {
> ++ asm volatile ("" :::
> ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25",
> "x26");
> ++ f(0, 0, 0, 0, 0, 0, 0, &y,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
> ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x);
> ++ }
> ++ g();
> ++ return 1;
> ++}
> +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c
> b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c
> +new file mode 100644
> +index 00000000000..690aae8dfd5
> +--- /dev/null
> ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c
> +@@ -0,0 +1,3 @@
> ++/* { dg-options "-O2 -fstack-protector-all -fstack-clash-protection
> -fomit-frame-pointer --param stack-clash-protection-guard-size=12
> -fsanitize=shadow-call-stack -ffixed-x18" } */
> ++
> ++#include "stack-check-prologue-19.c"
> +--
> +2.34.1
> +
> +
> +From c4f0e121faa36342f1d21919e54a05ad841c4f86 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:55 +0100
> +Subject: [PATCH 16/19] aarch64: Simplify probe of final frame allocation
> +
> +Previous patches ensured that the final frame allocation only needs
> +a probe when the size is strictly greater than 1KiB. It's therefore
> +safe to use the normal 1024 probe offset in all cases.
> +
> +The main motivation for doing this is to simplify the code and
> +reduce the number of special cases.
> +
> +gcc/
> + * config/aarch64/aarch64.cc
> (aarch64_allocate_and_probe_stack_space):
> + Always probe the residual allocation at offset 1024, asserting
> + that that is in range.
> +
> +gcc/testsuite/
> + * gcc.target/aarch64/stack-check-prologue-17.c: Expect the probe
> + to be at offset 1024 rather than offset 0.
> + * gcc.target/aarch64/stack-check-prologue-18.c: Likewise.
> + * gcc.target/aarch64/stack-check-prologue-19.c: Likewise.
> +---
> + gcc/config/aarch64/aarch64.cc | 12 ++++--------
> + .../gcc.target/aarch64/stack-check-prologue-17.c | 2 +-
> + .../gcc.target/aarch64/stack-check-prologue-18.c | 4 ++--
> + .../gcc.target/aarch64/stack-check-prologue-19.c | 4 ++--
> + 4 files changed, 9 insertions(+), 13 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index 16433fb70f4..8abf3d7a1e2 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -9523,16 +9523,12 @@ aarch64_allocate_and_probe_stack_space (rtx
> temp1, rtx temp2,
> + are still safe. */
> + if (residual)
> + {
> +- HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
> ++ gcc_assert (guard_used_by_caller + byte_sp_alignment <= size);
> ++
> + /* If we're doing final adjustments, and we've done any full page
> + allocations then any residual needs to be probed. */
> + if (final_adjustment_p && rounded_size != 0)
> + min_probe_threshold = 0;
> +- /* If doing a small final adjustment, we always probe at offset 0.
> +- This is done to avoid issues when the final adjustment is smaller
> +- than the probing offset. */
> +- else if (final_adjustment_p && rounded_size == 0)
> +- residual_probe_offset = 0;
> +
> + aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
> + if (residual >= min_probe_threshold)
> +@@ -9543,8 +9539,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1,
> rtx temp2,
> + HOST_WIDE_INT_PRINT_DEC " bytes, probing will be
> required."
> + "\n", residual);
> +
> +- emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
> +- residual_probe_offset));
> ++ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
> ++ guard_used_by_caller));
> + emit_insn (gen_blockage ());
> + }
> + }
> +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
> b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
> +index 0d8a25d73a2..f0ec1389771 100644
> +--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
> ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c
> +@@ -33,7 +33,7 @@ int test1(int z) {
> + ** ...
> + ** str x30, \[sp\]
> + ** sub sp, sp, #1040
> +-** str xzr, \[sp\]
> ++** str xzr, \[sp, #?1024\]
> + ** cbnz w0, .*
> + ** bl g
> + ** ...
> +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
> b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
> +index 82447d20fff..6383bec5ebc 100644
> +--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
> ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c
> +@@ -9,7 +9,7 @@ void g();
> + ** ...
> + ** str x30, \[sp\]
> + ** sub sp, sp, #4064
> +-** str xzr, \[sp\]
> ++** str xzr, \[sp, #?1024\]
> + ** cbnz w0, .*
> + ** bl g
> + ** ...
> +@@ -50,7 +50,7 @@ int test1(int z) {
> + ** ...
> + ** str x30, \[sp\]
> + ** sub sp, sp, #1040
> +-** str xzr, \[sp\]
> ++** str xzr, \[sp, #?1024\]
> + ** cbnz w0, .*
> + ** bl g
> + ** ...
> +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
> b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
> +index 73ac3e4e4eb..562039b5e9b 100644
> +--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
> ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c
> +@@ -9,7 +9,7 @@ void g();
> + ** ...
> + ** str x30, \[sp\]
> + ** sub sp, sp, #4064
> +-** str xzr, \[sp\]
> ++** str xzr, \[sp, #?1024\]
> + ** cbnz w0, .*
> + ** bl g
> + ** ...
> +@@ -50,7 +50,7 @@ int test1(int z) {
> + ** ...
> + ** str x30, \[sp\]
> + ** sub sp, sp, #1040
> +-** str xzr, \[sp\]
> ++** str xzr, \[sp, #?1024\]
> + ** cbnz w0, .*
> + ** bl g
> + ** ...
> +--
> +2.34.1
> +
> +
> +From 6f0ab0a9f46a17b68349ff6035aa776bf65f0575 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:56 +0100
> +Subject: [PATCH 17/19] aarch64: Explicitly record probe registers in frame
> + info
> +
> +The stack frame is currently divided into three areas:
> +
> +A: the area above the hard frame pointer
> +B: the SVE saves below the hard frame pointer
> +C: the outgoing arguments
> +
> +If the stack frame is allocated in one chunk, the allocation needs a
> +probe if the frame size is >= guard_size - 1KiB. In addition, if the
> +function is not a leaf function, it must probe an address no more than
> +1KiB above the outgoing SP. We ensured the second condition by
> +
> +(1) using single-chunk allocations for non-leaf functions only if
> + the link register save slot is within 512 bytes of the bottom
> + of the frame; and
> +
> +(2) using the link register save as a probe (meaning, for instance,
> + that it can't be individually shrink wrapped)
> +
> +If instead the stack is allocated in multiple chunks, then:
> +
> +* an allocation involving only the outgoing arguments (C above) requires
> + a probe if the allocation size is > 1KiB
> +
> +* any other allocation requires a probe if the allocation size
> + is >= guard_size - 1KiB
> +
> +* second and subsequent allocations require the previous allocation
> + to probe at the bottom of the allocated area, regardless of the size
> + of that previous allocation
> +
> +The final point means that, unlike for single allocations,
> +it can be necessary to have both a non-SVE register probe and
> +an SVE register probe. For example:
> +
> +* allocate A, probe using a non-SVE register save
> +* allocate B, probe using an SVE register save
> +* allocate C
> +
> +The non-SVE register used in this case was again the link register.
> +It was previously used even if the link register save slot was some
> +bytes above the bottom of the non-SVE register saves, but an earlier
> +patch avoided that by putting the link register save slot first.
> +
> +As a belt-and-braces fix, this patch explicitly records which
> +probe registers we're using and allows the non-SVE probe to be
> +whichever register comes first (as for SVE).
> +
> +The patch also avoids unnecessary probes in sve/pcs/stack_clash_3.c.
> +
> +gcc/
> + * config/aarch64/aarch64.h (aarch64_frame::sve_save_and_probe)
> + (aarch64_frame::hard_fp_save_and_probe): New fields.
> + * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize them.
> + Rather than asserting that a leaf function saves LR, instead assert
> + that a leaf function saves something.
> + (aarch64_get_separate_components): Prevent the chosen probe
> + registers from being individually shrink-wrapped.
> + (aarch64_allocate_and_probe_stack_space): Remove workaround for
> + probe registers that aren't at the bottom of the previous allocation.
> +
> +gcc/testsuite/
> + * gcc.target/aarch64/sve/pcs/stack_clash_3.c: Avoid redundant probes.
> +---
> + gcc/config/aarch64/aarch64.cc | 68 +++++++++++++++----
> + gcc/config/aarch64/aarch64.h | 8 +++
> + .../aarch64/sve/pcs/stack_clash_3.c | 6 +-
> + 3 files changed, 64 insertions(+), 18 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index 8abf3d7a1e2..a8d907df884 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8210,15 +8210,11 @@ aarch64_layout_frame (void)
> + && !crtl->abi->clobbers_full_reg_p (regno))
> + frame.reg_offset[regno] = SLOT_REQUIRED;
> +
> +- /* With stack-clash, LR must be saved in non-leaf functions. The
> saving of
> +- LR counts as an implicit probe which allows us to maintain the
> invariant
> +- described in the comment at expand_prologue. */
> +- gcc_assert (crtl->is_leaf
> +- || maybe_ne (frame.reg_offset[R30_REGNUM],
> SLOT_NOT_REQUIRED));
> +
> + poly_int64 offset = crtl->outgoing_args_size;
> + gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
> + frame.bytes_below_saved_regs = offset;
> ++ frame.sve_save_and_probe = INVALID_REGNUM;
> +
> + /* Now assign stack slots for the registers. Start with the predicate
> + registers, since predicate LDR and STR have a relatively small
> +@@ -8226,6 +8222,8 @@ aarch64_layout_frame (void)
> + for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
> + if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
> + {
> ++ if (frame.sve_save_and_probe == INVALID_REGNUM)
> ++ frame.sve_save_and_probe = regno;
> + frame.reg_offset[regno] = offset;
> + offset += BYTES_PER_SVE_PRED;
> + }
> +@@ -8263,6 +8261,8 @@ aarch64_layout_frame (void)
> + for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
> + if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
> + {
> ++ if (frame.sve_save_and_probe == INVALID_REGNUM)
> ++ frame.sve_save_and_probe = regno;
> + frame.reg_offset[regno] = offset;
> + offset += vector_save_size;
> + }
> +@@ -8272,10 +8272,18 @@ aarch64_layout_frame (void)
> + frame.below_hard_fp_saved_regs_size = offset -
> frame.bytes_below_saved_regs;
> + bool saves_below_hard_fp_p
> + = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
> ++ gcc_assert (!saves_below_hard_fp_p
> ++ || (frame.sve_save_and_probe != INVALID_REGNUM
> ++ && known_eq (frame.reg_offset[frame.sve_save_and_probe],
> ++ frame.bytes_below_saved_regs)));
> ++
> + frame.bytes_below_hard_fp = offset;
> ++ frame.hard_fp_save_and_probe = INVALID_REGNUM;
> +
> + auto allocate_gpr_slot = [&](unsigned int regno)
> + {
> ++ if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
> ++ frame.hard_fp_save_and_probe = regno;
> + frame.reg_offset[regno] = offset;
> + if (frame.wb_push_candidate1 == INVALID_REGNUM)
> + frame.wb_push_candidate1 = regno;
> +@@ -8309,6 +8317,8 @@ aarch64_layout_frame (void)
> + for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
> + if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
> + {
> ++ if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
> ++ frame.hard_fp_save_and_probe = regno;
> + /* If there is an alignment gap between integer and fp
> callee-saves,
> + allocate the last fp register to it if possible. */
> + if (regno == last_fp_reg
> +@@ -8332,6 +8342,17 @@ aarch64_layout_frame (void)
> + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
> +
> + frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
> ++ gcc_assert (known_eq (frame.saved_regs_size,
> ++ frame.below_hard_fp_saved_regs_size)
> ++ || (frame.hard_fp_save_and_probe != INVALID_REGNUM
> ++ && known_eq
> (frame.reg_offset[frame.hard_fp_save_and_probe],
> ++ frame.bytes_below_hard_fp)));
> ++
> ++ /* With stack-clash, a register must be saved in non-leaf functions.
> ++ The saving of the bottommost register counts as an implicit probe,
> ++ which allows us to maintain the invariant described in the comment
> ++ at expand_prologue. */
> ++ gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0));
> +
> + offset += get_frame_size ();
> + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
> +@@ -8462,6 +8483,25 @@ aarch64_layout_frame (void)
> + frame.final_adjust = frame.bytes_below_saved_regs;
> + }
> +
> ++ /* The frame is allocated in pieces, with each non-final piece
> ++ including a register save at offset 0 that acts as a probe for
> ++ the following piece. In addition, the save of the bottommost
> register
> ++ acts as a probe for callees and allocas. Roll back any probes that
> ++ aren't needed.
> ++
> ++ A probe isn't needed if it is associated with the final allocation
> ++ (including callees and allocas) that happens before the epilogue is
> ++ executed. */
> ++ if (crtl->is_leaf
> ++ && !cfun->calls_alloca
> ++ && known_eq (frame.final_adjust, 0))
> ++ {
> ++ if (maybe_ne (frame.sve_callee_adjust, 0))
> ++ frame.sve_save_and_probe = INVALID_REGNUM;
> ++ else
> ++ frame.hard_fp_save_and_probe = INVALID_REGNUM;
> ++ }
> ++
> + /* Make sure the individual adjustments add up to the full frame
> size. */
> + gcc_assert (known_eq (frame.initial_adjust
> + + frame.callee_adjust
> +@@ -9039,13 +9079,6 @@ aarch64_get_separate_components (void)
> +
> + poly_int64 offset = frame.reg_offset[regno];
> +
> +- /* If the register is saved in the first SVE save slot, we use
> +- it as a stack probe for -fstack-clash-protection. */
> +- if (flag_stack_clash_protection
> +- && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
> +- && known_eq (offset, frame.bytes_below_saved_regs))
> +- continue;
> +-
> + /* Get the offset relative to the register we'll use. */
> + if (frame_pointer_needed)
> + offset -= frame.bytes_below_hard_fp;
> +@@ -9080,6 +9113,13 @@ aarch64_get_separate_components (void)
> +
> + bitmap_clear_bit (components, LR_REGNUM);
> + bitmap_clear_bit (components, SP_REGNUM);
> ++ if (flag_stack_clash_protection)
> ++ {
> ++ if (frame.sve_save_and_probe != INVALID_REGNUM)
> ++ bitmap_clear_bit (components, frame.sve_save_and_probe);
> ++ if (frame.hard_fp_save_and_probe != INVALID_REGNUM)
> ++ bitmap_clear_bit (components, frame.hard_fp_save_and_probe);
> ++ }
> +
> + return components;
> + }
> +@@ -9616,8 +9656,8 @@ aarch64_epilogue_uses (int regno)
> + When probing is needed, we emit a probe at the start of the prologue
> + and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
> +
> +- We have to track how much space has been allocated and the only stores
> +- to the stack we track as implicit probes are the FP/LR stores.
> ++ We can also use register saves as probes. These are stored in
> ++ sve_save_and_probe and hard_fp_save_and_probe.
> +
> + For outgoing arguments we probe if the size is larger than 1KB, such
> that
> + the ABI specified buffer is maintained for the next callee.
> +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> +index b6135837073..46d4693e206 100644
> +--- a/gcc/config/aarch64/aarch64.h
> ++++ b/gcc/config/aarch64/aarch64.h
> +@@ -957,6 +957,14 @@ struct GTY (()) aarch64_frame
> + This is the register they should use. */
> + unsigned spare_pred_reg;
> +
> ++ /* An SVE register that is saved below the hard frame pointer and that
> acts
> ++ as a probe for later allocations, or INVALID_REGNUM if none. */
> ++ unsigned sve_save_and_probe;
> ++
> ++ /* A register that is saved at the hard frame pointer and that acts
> ++ as a probe for later allocations, or INVALID_REGNUM if none. */
> ++ unsigned hard_fp_save_and_probe;
> ++
> + bool laid_out;
> +
> + /* True if shadow call stack should be enabled for the current
> function. */
> +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
> b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
> +index 3e01ec36c3a..3530a0d504b 100644
> +--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
> ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
> +@@ -11,11 +11,10 @@
> + ** mov x11, sp
> + ** ...
> + ** sub sp, sp, x13
> +-** str p4, \[sp\]
> + ** cbz w0, [^\n]*
> ++** str p4, \[sp\]
> + ** ...
> + ** ptrue p0\.b, all
> +-** ldr p4, \[sp\]
> + ** addvl sp, sp, #1
> + ** ldr x24, \[sp\], 32
> + ** ret
> +@@ -39,13 +38,12 @@ test_1 (int n)
> + ** mov x11, sp
> + ** ...
> + ** sub sp, sp, x13
> +-** str p4, \[sp\]
> + ** cbz w0, [^\n]*
> ++** str p4, \[sp\]
> + ** str p5, \[sp, #1, mul vl\]
> + ** str p6, \[sp, #2, mul vl\]
> + ** ...
> + ** ptrue p0\.b, all
> +-** ldr p4, \[sp\]
> + ** addvl sp, sp, #1
> + ** ldr x24, \[sp\], 32
> + ** ret
> +--
> +2.34.1
> +
> +
> +From 8254e1b9cd500e0c278465a3657543477e9d1250 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:56 +0100
> +Subject: [PATCH 18/19] aarch64: Remove below_hard_fp_saved_regs_size
> +
> +After previous patches, it's no longer necessary to store
> +saved_regs_size and below_hard_fp_saved_regs_size in the frame info.
> +All measurements instead use the top or bottom of the frame as
> +reference points.
> +
> +gcc/
> + * config/aarch64/aarch64.h (aarch64_frame::saved_regs_size)
> + (aarch64_frame::below_hard_fp_saved_regs_size): Delete.
> + * config/aarch64/aarch64.cc (aarch64_layout_frame): Update accordingly.
> +---
> + gcc/config/aarch64/aarch64.cc | 45 ++++++++++++++++-------------------
> + gcc/config/aarch64/aarch64.h | 7 ------
> + 2 files changed, 21 insertions(+), 31 deletions(-)
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index a8d907df884..ac3d3b336a3 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8269,9 +8269,8 @@ aarch64_layout_frame (void)
> +
> + /* OFFSET is now the offset of the hard frame pointer from the bottom
> + of the callee save area. */
> +- frame.below_hard_fp_saved_regs_size = offset -
> frame.bytes_below_saved_regs;
> +- bool saves_below_hard_fp_p
> +- = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
> ++ auto below_hard_fp_saved_regs_size = offset -
> frame.bytes_below_saved_regs;
> ++ bool saves_below_hard_fp_p = maybe_ne (below_hard_fp_saved_regs_size,
> 0);
> + gcc_assert (!saves_below_hard_fp_p
> + || (frame.sve_save_and_probe != INVALID_REGNUM
> + && known_eq (frame.reg_offset[frame.sve_save_and_probe],
> +@@ -8341,9 +8340,8 @@ aarch64_layout_frame (void)
> +
> + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
> +
> +- frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
> +- gcc_assert (known_eq (frame.saved_regs_size,
> +- frame.below_hard_fp_saved_regs_size)
> ++ auto saved_regs_size = offset - frame.bytes_below_saved_regs;
> ++ gcc_assert (known_eq (saved_regs_size, below_hard_fp_saved_regs_size)
> + || (frame.hard_fp_save_and_probe != INVALID_REGNUM
> + && known_eq
> (frame.reg_offset[frame.hard_fp_save_and_probe],
> + frame.bytes_below_hard_fp)));
> +@@ -8352,7 +8350,7 @@ aarch64_layout_frame (void)
> + The saving of the bottommost register counts as an implicit probe,
> + which allows us to maintain the invariant described in the comment
> + at expand_prologue. */
> +- gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0));
> ++ gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
> +
> + offset += get_frame_size ();
> + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
> +@@ -8409,7 +8407,7 @@ aarch64_layout_frame (void)
> +
> + HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
> + HOST_WIDE_INT const_saved_regs_size;
> +- if (known_eq (frame.saved_regs_size, 0))
> ++ if (known_eq (saved_regs_size, 0))
> + frame.initial_adjust = frame.frame_size;
> + else if (frame.frame_size.is_constant (&const_size)
> + && const_size < max_push_offset
> +@@ -8422,7 +8420,7 @@ aarch64_layout_frame (void)
> + frame.callee_adjust = const_size;
> + }
> + else if (frame.bytes_below_saved_regs.is_constant
> (&const_below_saved_regs)
> +- && frame.saved_regs_size.is_constant (&const_saved_regs_size)
> ++ && saved_regs_size.is_constant (&const_saved_regs_size)
> + && const_below_saved_regs + const_saved_regs_size < 512
> + /* We could handle this case even with data below the saved
> + registers, provided that that data left us with valid offsets
> +@@ -8441,8 +8439,7 @@ aarch64_layout_frame (void)
> + frame.initial_adjust = frame.frame_size;
> + }
> + else if (saves_below_hard_fp_p
> +- && known_eq (frame.saved_regs_size,
> +- frame.below_hard_fp_saved_regs_size))
> ++ && known_eq (saved_regs_size, below_hard_fp_saved_regs_size))
> + {
> + /* Frame in which all saves are SVE saves:
> +
> +@@ -8464,7 +8461,7 @@ aarch64_layout_frame (void)
> + [save SVE registers relative to SP]
> + sub sp, sp, bytes_below_saved_regs */
> + frame.callee_adjust = const_above_fp;
> +- frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
> ++ frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
> + frame.final_adjust = frame.bytes_below_saved_regs;
> + }
> + else
> +@@ -8479,7 +8476,7 @@ aarch64_layout_frame (void)
> + [save SVE registers relative to SP]
> + sub sp, sp, bytes_below_saved_regs */
> + frame.initial_adjust = frame.bytes_above_hard_fp;
> +- frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
> ++ frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
> + frame.final_adjust = frame.bytes_below_saved_regs;
> + }
> +
> +@@ -9621,17 +9618,17 @@ aarch64_epilogue_uses (int regno)
> + | local variables | <-- frame_pointer_rtx
> + | |
> + +-------------------------------+
> +- | padding | \
> +- +-------------------------------+ |
> +- | callee-saved registers | | frame.saved_regs_size
> +- +-------------------------------+ |
> +- | LR' | |
> +- +-------------------------------+ |
> +- | FP' | |
> +- +-------------------------------+ |<- hard_frame_pointer_rtx
> (aligned)
> +- | SVE vector registers | | \
> +- +-------------------------------+ | |
> below_hard_fp_saved_regs_size
> +- | SVE predicate registers | / /
> ++ | padding |
> ++ +-------------------------------+
> ++ | callee-saved registers |
> ++ +-------------------------------+
> ++ | LR' |
> ++ +-------------------------------+
> ++ | FP' |
> ++ +-------------------------------+ <-- hard_frame_pointer_rtx
> (aligned)
> ++ | SVE vector registers |
> ++ +-------------------------------+
> ++ | SVE predicate registers |
> + +-------------------------------+
> + | dynamic allocation |
> + +-------------------------------+
> +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> +index 46d4693e206..01f7751bc78 100644
> +--- a/gcc/config/aarch64/aarch64.h
> ++++ b/gcc/config/aarch64/aarch64.h
> +@@ -871,18 +871,11 @@ struct GTY (()) aarch64_frame
> + STACK_BOUNDARY. */
> + HOST_WIDE_INT saved_varargs_size;
> +
> +- /* The size of the callee-save registers with a slot in REG_OFFSET. */
> +- poly_int64 saved_regs_size;
> +-
> + /* The number of bytes between the bottom of the static frame (the
> bottom
> + of the outgoing arguments) and the bottom of the register save area.
> + This value is always a multiple of STACK_BOUNDARY. */
> + poly_int64 bytes_below_saved_regs;
> +
> +- /* The size of the callee-save registers with a slot in REG_OFFSET that
> +- are saved below the hard frame pointer. */
> +- poly_int64 below_hard_fp_saved_regs_size;
> +-
> + /* The number of bytes between the bottom of the static frame (the
> bottom
> + of the outgoing arguments) and the hard frame pointer. This value
> is
> + always a multiple of STACK_BOUNDARY. */
> +--
> +2.34.1
> +
> +
> +From 75c37e031408262263442f5b4cdb83d3777b6422 Mon Sep 17 00:00:00 2001
> +From: Richard Sandiford <richard.sandiford@arm.com>
> +Date: Tue, 12 Sep 2023 16:08:57 +0100
> +Subject: [PATCH 19/19] aarch64: Make stack smash canary protect saved
> + registers
> +
> +AArch64 normally puts the saved registers near the bottom of the frame,
> +immediately above any dynamic allocations. But this means that a
> +stack-smash attack on those dynamic allocations could overwrite the
> +saved registers without needing to reach as far as the stack smash
> +canary.
> +
> +The same thing could also happen for variable-sized arguments that are
> +passed by value, since those are allocated before a call and popped on
> +return.
> +
> +This patch avoids that by putting the locals (and thus the canary) below
> +the saved registers when stack smash protection is active.
> +
> +The patch fixes CVE-2023-4039.
> +
> +gcc/
> + * config/aarch64/aarch64.cc (aarch64_save_regs_above_locals_p):
> + New function.
> + (aarch64_layout_frame): Use it to decide whether locals should
> + go above or below the saved registers.
> + (aarch64_expand_prologue): Update stack layout comment.
> + Emit a stack tie after the final adjustment.
> +
> +gcc/testsuite/
> + * gcc.target/aarch64/stack-protector-8.c: New test.
> + * gcc.target/aarch64/stack-protector-9.c: Likewise.
> +---
> + gcc/config/aarch64/aarch64.cc | 46 +++++++--
> + .../gcc.target/aarch64/stack-protector-8.c | 95 +++++++++++++++++++
> + .../gcc.target/aarch64/stack-protector-9.c | 33 +++++++
> + 3 files changed, 168 insertions(+), 6 deletions(-)
> + create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
> + create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
> +
> +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> +index ac3d3b336a3..96c3f48fdc4 100644
> +--- a/gcc/config/aarch64/aarch64.cc
> ++++ b/gcc/config/aarch64/aarch64.cc
> +@@ -8133,6 +8133,20 @@ aarch64_needs_frame_chain (void)
> + return aarch64_use_frame_pointer;
> + }
> +
> ++/* Return true if the current function should save registers above
> ++ the locals area, rather than below it. */
> ++
> ++static bool
> ++aarch64_save_regs_above_locals_p ()
> ++{
> ++ /* When using stack smash protection, make sure that the canary slot
> ++ comes between the locals and the saved registers. Otherwise,
> ++ it would be possible for a carefully sized smash attack to change
> ++ the saved registers (particularly LR and FP) without reaching the
> ++ canary. */
> ++ return crtl->stack_protect_guard;
> ++}
> ++
> + /* Mark the registers that need to be saved by the callee and calculate
> + the size of the callee-saved registers area and frame record (both FP
> + and LR may be omitted). */
> +@@ -8144,6 +8158,7 @@ aarch64_layout_frame (void)
> + poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
> + bool frame_related_fp_reg_p = false;
> + aarch64_frame &frame = cfun->machine->frame;
> ++ poly_int64 top_of_locals = -1;
> +
> + frame.emit_frame_chain = aarch64_needs_frame_chain ();
> +
> +@@ -8210,9 +8225,16 @@ aarch64_layout_frame (void)
> + && !crtl->abi->clobbers_full_reg_p (regno))
> + frame.reg_offset[regno] = SLOT_REQUIRED;
> +
> ++ bool regs_at_top_p = aarch64_save_regs_above_locals_p ();
> +
> + poly_int64 offset = crtl->outgoing_args_size;
> + gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
> ++ if (regs_at_top_p)
> ++ {
> ++ offset += get_frame_size ();
> ++ offset = aligned_upper_bound (offset, STACK_BOUNDARY /
> BITS_PER_UNIT);
> ++ top_of_locals = offset;
> ++ }
> + frame.bytes_below_saved_regs = offset;
> + frame.sve_save_and_probe = INVALID_REGNUM;
> +
> +@@ -8352,15 +8374,18 @@ aarch64_layout_frame (void)
> + at expand_prologue. */
> + gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
> +
> +- offset += get_frame_size ();
> +- offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
> +- auto top_of_locals = offset;
> +-
> ++ if (!regs_at_top_p)
> ++ {
> ++ offset += get_frame_size ();
> ++ offset = aligned_upper_bound (offset, STACK_BOUNDARY /
> BITS_PER_UNIT);
> ++ top_of_locals = offset;
> ++ }
> + offset += frame.saved_varargs_size;
> + gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
> + frame.frame_size = offset;
> +
> + frame.bytes_above_hard_fp = frame.frame_size -
> frame.bytes_below_hard_fp;
> ++ gcc_assert (known_ge (top_of_locals, 0));
> + frame.bytes_above_locals = frame.frame_size - top_of_locals;
> +
> + frame.initial_adjust = 0;
> +@@ -9615,10 +9640,10 @@ aarch64_epilogue_uses (int regno)
> + | for register varargs |
> + | |
> + +-------------------------------+
> +- | local variables | <-- frame_pointer_rtx
> ++ | local variables (1) | <-- frame_pointer_rtx
> + | |
> + +-------------------------------+
> +- | padding |
> ++ | padding (1) |
> + +-------------------------------+
> + | callee-saved registers |
> + +-------------------------------+
> +@@ -9630,6 +9655,10 @@ aarch64_epilogue_uses (int regno)
> + +-------------------------------+
> + | SVE predicate registers |
> + +-------------------------------+
> ++ | local variables (2) |
> ++ +-------------------------------+
> ++ | padding (2) |
> ++ +-------------------------------+
> + | dynamic allocation |
> + +-------------------------------+
> + | padding |
> +@@ -9639,6 +9668,9 @@ aarch64_epilogue_uses (int regno)
> + +-------------------------------+
> + | | <-- stack_pointer_rtx (aligned)
> +
> ++ The regions marked (1) and (2) are mutually exclusive. (2) is used
> ++ when aarch64_save_regs_above_locals_p is true.
> ++
> + Dynamic stack allocations via alloca() decrease stack_pointer_rtx
> + but leave frame_pointer_rtx and hard_frame_pointer_rtx
> + unchanged.
> +@@ -9834,6 +9866,8 @@ aarch64_expand_prologue (void)
> + gcc_assert (known_eq (bytes_below_sp, final_adjust));
> + aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
> final_adjust,
> + !frame_pointer_needed, true);
> ++ if (emit_frame_chain && maybe_ne (final_adjust, 0))
> ++ emit_insn (gen_stack_tie (stack_pointer_rtx,
> hard_frame_pointer_rtx));
> + }
> +
> + /* Return TRUE if we can use a simple_return insn.
> +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
> b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
> +new file mode 100644
> +index 00000000000..e71d820e365
> +--- /dev/null
> ++++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c
> +@@ -0,0 +1,95 @@
> ++/* { dg-options " -O -fstack-protector-strong
> -mstack-protector-guard=sysreg -mstack-protector-guard-reg=tpidr2_el0
> -mstack-protector-guard-offset=16" } */
> ++/* { dg-final { check-function-bodies "**" "" } } */
> ++
> ++void g(void *);
> ++__SVBool_t *h(void *);
> ++
> ++/*
> ++** test1:
> ++** sub sp, sp, #288
> ++** stp x29, x30, \[sp, #?272\]
> ++** add x29, sp, #?272
> ++** mrs (x[0-9]+), tpidr2_el0
> ++** ldr (x[0-9]+), \[\1, #?16\]
> ++** str \2, \[sp, #?264\]
> ++** mov \2, #?0
> ++** add x0, sp, #?8
> ++** bl g
> ++** ...
> ++** mrs .*
> ++** ...
> ++** bne .*
> ++** ...
> ++** ldp x29, x30, \[sp, #?272\]
> ++** add sp, sp, #?288
> ++** ret
> ++** bl __stack_chk_fail
> ++*/
> ++int test1() {
> ++ int y[0x40];
> ++ g(y);
> ++ return 1;
> ++}
> ++
> ++/*
> ++** test2:
> ++** stp x29, x30, \[sp, #?-16\]!
> ++** mov x29, sp
> ++** sub sp, sp, #1040
> ++** mrs (x[0-9]+), tpidr2_el0
> ++** ldr (x[0-9]+), \[\1, #?16\]
> ++** str \2, \[sp, #?1032\]
> ++** mov \2, #?0
> ++** add x0, sp, #?8
> ++** bl g
> ++** ...
> ++** mrs .*
> ++** ...
> ++** bne .*
> ++** ...
> ++** add sp, sp, #?1040
> ++** ldp x29, x30, \[sp\], #?16
> ++** ret
> ++** bl __stack_chk_fail
> ++*/
> ++int test2() {
> ++ int y[0x100];
> ++ g(y);
> ++ return 1;
> ++}
> ++
> ++#pragma GCC target "+sve"
> ++
> ++/*
> ++** test3:
> ++** stp x29, x30, \[sp, #?-16\]!
> ++** mov x29, sp
> ++** addvl sp, sp, #-18
> ++** ...
> ++** str p4, \[sp\]
> ++** ...
> ++** sub sp, sp, #272
> ++** mrs (x[0-9]+), tpidr2_el0
> ++** ldr (x[0-9]+), \[\1, #?16\]
> ++** str \2, \[sp, #?264\]
> ++** mov \2, #?0
> ++** add x0, sp, #?8
> ++** bl h
> ++** ...
> ++** mrs .*
> ++** ...
> ++** bne .*
> ++** ...
> ++** add sp, sp, #?272
> ++** ...
> ++** ldr p4, \[sp\]
> ++** ...
> ++** addvl sp, sp, #18
> ++** ldp x29, x30, \[sp\], #?16
> ++** ret
> ++** bl __stack_chk_fail
> ++*/
> ++__SVBool_t test3() {
> ++ int y[0x40];
> ++ return *h(y);
> ++}
> +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
> b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
> +new file mode 100644
> +index 00000000000..58f322aa480
> +--- /dev/null
> ++++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c
> +@@ -0,0 +1,33 @@
> ++/* { dg-options "-O2 -mcpu=neoverse-v1 -fstack-protector-all" } */
> ++/* { dg-final { check-function-bodies "**" "" } } */
> ++
> ++/*
> ++** main:
> ++** ...
> ++** stp x29, x30, \[sp, #?-[0-9]+\]!
> ++** ...
> ++** sub sp, sp, #[0-9]+
> ++** ...
> ++** str x[0-9]+, \[x29, #?-8\]
> ++** ...
> ++*/
> ++int f(const char *);
> ++void g(void *);
> ++int main(int argc, char* argv[])
> ++{
> ++ int a;
> ++ int b;
> ++ char c[2+f(argv[1])];
> ++ int d[0x100];
> ++ char y;
> ++
> ++ y=42; a=4; b=10;
> ++ c[0] = 'h'; c[1] = '\0';
> ++
> ++ c[f(argv[2])] = '\0';
> ++
> ++ __builtin_printf("%d %d\n%s\n", a, b, c);
> ++ g(d);
> ++
> ++ return 0;
> ++}
> +--
> +2.34.1
> +
> --
> 2.34.1
>
>
> -=-=-=-=-=-=-=-=-=-=-=-
> Links: You receive all messages sent to this group.
> View/Reply Online (#187543):
> https://lists.openembedded.org/g/openembedded-core/message/187543
> Mute This Topic: https://lists.openembedded.org/mt/101319990/3617156
> Group Owner: openembedded-core+owner@lists.openembedded.org
> Unsubscribe: https://lists.openembedded.org/g/openembedded-core/unsub [
> martin.jansa@gmail.com]
> -=-=-=-=-=-=-=-=-=-=-=-
>
>
On Thu, Sep 14, 2023 at 11:07 AM Martin Jansa <martin.jansa@gmail.com> wrote: > FYI: one of LGE proprietary components triggers ICE with this applied, > I'll try to find minimal reproducer later, this is just for other people > who might hit the same: > > error: unrecognizable insn: > 2923 | } > | ^ > (insn 416 286 290 17 (parallel [ > (set (mem/c:SI (plus:DI (reg/f:DI 29 x29) > (const_int -260 [0xfffffffffffffefc])) [1 > redacted.pixel_format+0 S4 A32]) > (const_int 0 [0])) > (set (mem/c:SI (plus:DI (reg/f:DI 29 x29) > (const_int -256 [0xffffffffffffff00])) [1 > redacted.pixel_value+0 S4 A128]) > (reg/v:SI 22 x22 [orig:141 color ] [141])) > ]) > "TOPDIR/BUILD/work/redacted/redacted/redacted/git/redacted.c":2903:45 -1 > (expr_list:REG_DEAD (reg/v:SI 22 x22 [orig:141 color ] [141]) > (nil))) > during RTL pass: cprop_hardreg > TOPDIR/BUILD/work/redacted/redacted/redacted/git/redacted.c:2923:1: > internal compiler error: in extract_insn, at recog.cc:2791 > 0x191624a internal_error(char const*, ...) > ???:0 > 0x6bee26 fancy_abort(char const*, int, char const*) > ???:0 > 0x697469 _fatal_insn(char const*, rtx_def const*, char const*, int, char > const*) > ???:0 > 0x697485 _fatal_insn_not_found(rtx_def const*, char const*, int, char > const*) > ???:0 > 0xbef198 extract_constrain_insn(rtx_insn*) > ???:0 > And the same code fails like this only with gcc-12.3 in mickledore and gcc-13.2 in nanbield. kirkstone with gcc-11.4 and your patch (as it is in kirkstone-nut) builds the same code fine.
On 14 Sep 2023, at 10:07, Martin Jansa via lists.openembedded.org <Martin.Jansa=gmail.com@lists.openembedded.org> wrote: > > FYI: one of LGE proprietary components triggers ICE with this applied, I'll try to find minimal reproducer later, this is just for other people who might hit the same: That’s… upsetting. I’ve forwarded this to our toolchain team. If you can whittle down a reproducer that would be _much_ appreciated, but I’ll see if they have any ideas about where the issue might be. Ross
diff --git a/meta/recipes-devtools/gcc/gcc-12.3.inc b/meta/recipes-devtools/gcc/gcc-12.3.inc index 4ec03f925c8..5896f26e1af 100644 --- a/meta/recipes-devtools/gcc/gcc-12.3.inc +++ b/meta/recipes-devtools/gcc/gcc-12.3.inc @@ -63,6 +63,7 @@ SRC_URI = "${BASEURI} \ file://0026-rust-recursion-limit.patch \ file://prefix-map-realpath.patch \ file://hardcoded-paths.patch \ + file://CVE-2023-4039.patch \ " SRC_URI[sha256sum] = "949a5d4f99e786421a93b532b22ffab5578de7321369975b91aec97adfda8c3b" diff --git a/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch b/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch new file mode 100644 index 00000000000..8cb52849cd3 --- /dev/null +++ b/meta/recipes-devtools/gcc/gcc/CVE-2023-4039.patch @@ -0,0 +1,3093 @@ +From: Richard Sandiford <richard.sandiford@arm.com> +Subject: [PATCH 00/19] aarch64: Fix -fstack-protector issue +Date: Tue, 12 Sep 2023 16:25:10 +0100 + +This series of patches fixes deficiencies in GCC's -fstack-protector +implementation for AArch64 when using dynamically allocated stack space. +This is CVE-2023-4039. See: + +https://developer.arm.com/Arm%20Security%20Center/GCC%20Stack%20Protector%20Vulnerability%20AArch64 +https://github.com/metaredteam/external-disclosures/security/advisories/GHSA-x7ch-h5rf-w2mf + +for more details. + +The fix is to put the saved registers above the locals area when +-fstack-protector is used. + +The series also fixes a stack-clash problem that I found while working +on the CVE. In unpatched sources, the stack-clash problem would only +trigger for unrealistic numbers of arguments (8K 64-bit arguments, or an +equivalent). But it would be a more significant issue with the new +-fstack-protector frame layout. It's therefore important that both +problems are fixed together. + +Some reorganisation of the code seemed necessary to fix the problems in a +cleanish way. The series is therefore quite long, but only a handful of +patches should have any effect on code generation. 
+ +See the individual patches for a detailed description. + +Tested on aarch64-linux-gnu. Pushed to trunk and to all active branches. +I've also pushed backports to GCC 7+ to vendors/ARM/heads/CVE-2023-4039. + +CVE: CVE-2023-4039 +Upstream-Status: Backport +Signed-off-by: Ross Burton <ross.burton@arm.com> + + +From 62fbb215cc817e9f2c1ca80282a64f4ee30806bc Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:48 +0100 +Subject: [PATCH 01/19] aarch64: Use local frame vars in shrink-wrapping code + +aarch64_layout_frame uses a shorthand for referring to +cfun->machine->frame: + + aarch64_frame &frame = cfun->machine->frame; + +This patch does the same for some other heavy users of the structure. +No functional change intended. + +gcc/ + * config/aarch64/aarch64.cc (aarch64_save_callee_saves): Use + a local shorthand for cfun->machine->frame. + (aarch64_restore_callee_saves, aarch64_get_separate_components): + (aarch64_process_components): Likewise. + (aarch64_allocate_and_probe_stack_space): Likewise. + (aarch64_expand_prologue, aarch64_expand_epilogue): Likewise. + (aarch64_layout_frame): Use existing shorthand for one more case. 
+--- + gcc/config/aarch64/aarch64.cc | 123 ++++++++++++++++++---------------- + 1 file changed, 64 insertions(+), 59 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 226dc9dffd4..ae42ffdedbe 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8351,7 +8351,7 @@ aarch64_layout_frame (void) + frame.is_scs_enabled + = (!crtl->calls_eh_return + && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK) +- && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)); ++ && known_ge (frame.reg_offset[LR_REGNUM], 0)); + + /* When shadow call stack is enabled, the scs_pop in the epilogue will + restore x30, and we don't need to pop x30 again in the traditional +@@ -8763,6 +8763,7 @@ aarch64_save_callee_saves (poly_int64 start_offset, + unsigned start, unsigned limit, bool skip_wb, + bool hard_fp_valid_p) + { ++ aarch64_frame &frame = cfun->machine->frame; + rtx_insn *insn; + unsigned regno; + unsigned regno2; +@@ -8777,8 +8778,8 @@ aarch64_save_callee_saves (poly_int64 start_offset, + bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno); + + if (skip_wb +- && (regno == cfun->machine->frame.wb_push_candidate1 +- || regno == cfun->machine->frame.wb_push_candidate2)) ++ && (regno == frame.wb_push_candidate1 ++ || regno == frame.wb_push_candidate2)) + continue; + + if (cfun->machine->reg_is_wrapped_separately[regno]) +@@ -8786,7 +8787,7 @@ aarch64_save_callee_saves (poly_int64 start_offset, + + machine_mode mode = aarch64_reg_save_mode (regno); + reg = gen_rtx_REG (mode, regno); +- offset = start_offset + cfun->machine->frame.reg_offset[regno]; ++ offset = start_offset + frame.reg_offset[regno]; + rtx base_rtx = stack_pointer_rtx; + poly_int64 sp_offset = offset; + +@@ -8799,7 +8800,7 @@ aarch64_save_callee_saves (poly_int64 start_offset, + { + gcc_assert (known_eq (start_offset, 0)); + poly_int64 fp_offset +- = cfun->machine->frame.below_hard_fp_saved_regs_size; ++ = 
frame.below_hard_fp_saved_regs_size; + if (hard_fp_valid_p) + base_rtx = hard_frame_pointer_rtx; + else +@@ -8821,8 +8822,7 @@ aarch64_save_callee_saves (poly_int64 start_offset, + && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit + && !cfun->machine->reg_is_wrapped_separately[regno2] + && known_eq (GET_MODE_SIZE (mode), +- cfun->machine->frame.reg_offset[regno2] +- - cfun->machine->frame.reg_offset[regno])) ++ frame.reg_offset[regno2] - frame.reg_offset[regno])) + { + rtx reg2 = gen_rtx_REG (mode, regno2); + rtx mem2; +@@ -8872,6 +8872,7 @@ static void + aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, + unsigned limit, bool skip_wb, rtx *cfi_ops) + { ++ aarch64_frame &frame = cfun->machine->frame; + unsigned regno; + unsigned regno2; + poly_int64 offset; +@@ -8888,13 +8889,13 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, + rtx reg, mem; + + if (skip_wb +- && (regno == cfun->machine->frame.wb_pop_candidate1 +- || regno == cfun->machine->frame.wb_pop_candidate2)) ++ && (regno == frame.wb_pop_candidate1 ++ || regno == frame.wb_pop_candidate2)) + continue; + + machine_mode mode = aarch64_reg_save_mode (regno); + reg = gen_rtx_REG (mode, regno); +- offset = start_offset + cfun->machine->frame.reg_offset[regno]; ++ offset = start_offset + frame.reg_offset[regno]; + rtx base_rtx = stack_pointer_rtx; + if (mode == VNx2DImode && BYTES_BIG_ENDIAN) + aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg, +@@ -8905,8 +8906,7 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, + && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit + && !cfun->machine->reg_is_wrapped_separately[regno2] + && known_eq (GET_MODE_SIZE (mode), +- cfun->machine->frame.reg_offset[regno2] +- - cfun->machine->frame.reg_offset[regno])) ++ frame.reg_offset[regno2] - frame.reg_offset[regno])) + { + rtx reg2 = gen_rtx_REG (mode, regno2); + rtx mem2; +@@ -9011,6 +9011,7 @@ 
offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset) + static sbitmap + aarch64_get_separate_components (void) + { ++ aarch64_frame &frame = cfun->machine->frame; + sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1); + bitmap_clear (components); + +@@ -9027,18 +9028,18 @@ aarch64_get_separate_components (void) + if (mode == VNx2DImode && BYTES_BIG_ENDIAN) + continue; + +- poly_int64 offset = cfun->machine->frame.reg_offset[regno]; ++ poly_int64 offset = frame.reg_offset[regno]; + + /* If the register is saved in the first SVE save slot, we use + it as a stack probe for -fstack-clash-protection. */ + if (flag_stack_clash_protection +- && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0) ++ && maybe_ne (frame.below_hard_fp_saved_regs_size, 0) + && known_eq (offset, 0)) + continue; + + /* Get the offset relative to the register we'll use. */ + if (frame_pointer_needed) +- offset -= cfun->machine->frame.below_hard_fp_saved_regs_size; ++ offset -= frame.below_hard_fp_saved_regs_size; + else + offset += crtl->outgoing_args_size; + +@@ -9057,11 +9058,11 @@ aarch64_get_separate_components (void) + /* If the spare predicate register used by big-endian SVE code + is call-preserved, it must be saved in the main prologue + before any saves that use it. */ +- if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM) +- bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg); ++ if (frame.spare_pred_reg != INVALID_REGNUM) ++ bitmap_clear_bit (components, frame.spare_pred_reg); + +- unsigned reg1 = cfun->machine->frame.wb_push_candidate1; +- unsigned reg2 = cfun->machine->frame.wb_push_candidate2; ++ unsigned reg1 = frame.wb_push_candidate1; ++ unsigned reg2 = frame.wb_push_candidate2; + /* If registers have been chosen to be stored/restored with + writeback don't interfere with them to avoid having to output explicit + stack adjustment instructions. 
*/ +@@ -9170,6 +9171,7 @@ aarch64_get_next_set_bit (sbitmap bmp, unsigned int start) + static void + aarch64_process_components (sbitmap components, bool prologue_p) + { ++ aarch64_frame &frame = cfun->machine->frame; + rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed + ? HARD_FRAME_POINTER_REGNUM + : STACK_POINTER_REGNUM); +@@ -9184,9 +9186,9 @@ aarch64_process_components (sbitmap components, bool prologue_p) + machine_mode mode = aarch64_reg_save_mode (regno); + + rtx reg = gen_rtx_REG (mode, regno); +- poly_int64 offset = cfun->machine->frame.reg_offset[regno]; ++ poly_int64 offset = frame.reg_offset[regno]; + if (frame_pointer_needed) +- offset -= cfun->machine->frame.below_hard_fp_saved_regs_size; ++ offset -= frame.below_hard_fp_saved_regs_size; + else + offset += crtl->outgoing_args_size; + +@@ -9211,14 +9213,14 @@ aarch64_process_components (sbitmap components, bool prologue_p) + break; + } + +- poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2]; ++ poly_int64 offset2 = frame.reg_offset[regno2]; + /* The next register is not of the same class or its offset is not + mergeable with the current one into a pair. */ + if (aarch64_sve_mode_p (mode) + || !satisfies_constraint_Ump (mem) + || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2) + || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno)) +- || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]), ++ || maybe_ne ((offset2 - frame.reg_offset[regno]), + GET_MODE_SIZE (mode))) + { + insn = emit_insn (set); +@@ -9240,7 +9242,7 @@ aarch64_process_components (sbitmap components, bool prologue_p) + /* REGNO2 can be saved/restored in a pair with REGNO. 
*/ + rtx reg2 = gen_rtx_REG (mode, regno2); + if (frame_pointer_needed) +- offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size; ++ offset2 -= frame.below_hard_fp_saved_regs_size; + else + offset2 += crtl->outgoing_args_size; + rtx addr2 = plus_constant (Pmode, ptr_reg, offset2); +@@ -9335,6 +9337,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, + bool frame_related_p, + bool final_adjustment_p) + { ++ aarch64_frame &frame = cfun->machine->frame; + HOST_WIDE_INT guard_size + = 1 << param_stack_clash_protection_guard_size; + HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD; +@@ -9355,25 +9358,25 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, + register as a probe. We can't assume that LR was saved at position 0 + though, so treat any space below it as unprobed. */ + if (final_adjustment_p +- && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)) ++ && known_eq (frame.below_hard_fp_saved_regs_size, 0)) + { +- poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM]; ++ poly_int64 lr_offset = frame.reg_offset[LR_REGNUM]; + if (known_ge (lr_offset, 0)) + min_probe_threshold -= lr_offset.to_constant (); + else + gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0)); + } + +- poly_int64 frame_size = cfun->machine->frame.frame_size; ++ poly_int64 frame_size = frame.frame_size; + + /* We should always have a positive probe threshold. 
*/ + gcc_assert (min_probe_threshold > 0); + + if (flag_stack_clash_protection && !final_adjustment_p) + { +- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust; +- poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; +- poly_int64 final_adjust = cfun->machine->frame.final_adjust; ++ poly_int64 initial_adjust = frame.initial_adjust; ++ poly_int64 sve_callee_adjust = frame.sve_callee_adjust; ++ poly_int64 final_adjust = frame.final_adjust; + + if (known_eq (frame_size, 0)) + { +@@ -9662,17 +9665,18 @@ aarch64_epilogue_uses (int regno) + void + aarch64_expand_prologue (void) + { +- poly_int64 frame_size = cfun->machine->frame.frame_size; +- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust; +- HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust; +- poly_int64 final_adjust = cfun->machine->frame.final_adjust; +- poly_int64 callee_offset = cfun->machine->frame.callee_offset; +- poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; ++ aarch64_frame &frame = cfun->machine->frame; ++ poly_int64 frame_size = frame.frame_size; ++ poly_int64 initial_adjust = frame.initial_adjust; ++ HOST_WIDE_INT callee_adjust = frame.callee_adjust; ++ poly_int64 final_adjust = frame.final_adjust; ++ poly_int64 callee_offset = frame.callee_offset; ++ poly_int64 sve_callee_adjust = frame.sve_callee_adjust; + poly_int64 below_hard_fp_saved_regs_size +- = cfun->machine->frame.below_hard_fp_saved_regs_size; +- unsigned reg1 = cfun->machine->frame.wb_push_candidate1; +- unsigned reg2 = cfun->machine->frame.wb_push_candidate2; +- bool emit_frame_chain = cfun->machine->frame.emit_frame_chain; ++ = frame.below_hard_fp_saved_regs_size; ++ unsigned reg1 = frame.wb_push_candidate1; ++ unsigned reg2 = frame.wb_push_candidate2; ++ bool emit_frame_chain = frame.emit_frame_chain; + rtx_insn *insn; + + if (flag_stack_clash_protection && known_eq (callee_adjust, 0)) +@@ -9703,7 +9707,7 @@ aarch64_expand_prologue (void) + } + + /* Push 
return address to shadow call stack. */ +- if (cfun->machine->frame.is_scs_enabled) ++ if (frame.is_scs_enabled) + emit_insn (gen_scs_push ()); + + if (flag_stack_usage_info) +@@ -9742,7 +9746,7 @@ aarch64_expand_prologue (void) + + /* The offset of the frame chain record (if any) from the current SP. */ + poly_int64 chain_offset = (initial_adjust + callee_adjust +- - cfun->machine->frame.hard_fp_offset); ++ - frame.hard_fp_offset); + gcc_assert (known_ge (chain_offset, 0)); + + /* The offset of the bottom of the save area from the current SP. */ +@@ -9845,16 +9849,17 @@ aarch64_use_return_insn_p (void) + void + aarch64_expand_epilogue (bool for_sibcall) + { +- poly_int64 initial_adjust = cfun->machine->frame.initial_adjust; +- HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust; +- poly_int64 final_adjust = cfun->machine->frame.final_adjust; +- poly_int64 callee_offset = cfun->machine->frame.callee_offset; +- poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; ++ aarch64_frame &frame = cfun->machine->frame; ++ poly_int64 initial_adjust = frame.initial_adjust; ++ HOST_WIDE_INT callee_adjust = frame.callee_adjust; ++ poly_int64 final_adjust = frame.final_adjust; ++ poly_int64 callee_offset = frame.callee_offset; ++ poly_int64 sve_callee_adjust = frame.sve_callee_adjust; + poly_int64 below_hard_fp_saved_regs_size +- = cfun->machine->frame.below_hard_fp_saved_regs_size; +- unsigned reg1 = cfun->machine->frame.wb_pop_candidate1; +- unsigned reg2 = cfun->machine->frame.wb_pop_candidate2; +- unsigned int last_gpr = (cfun->machine->frame.is_scs_enabled ++ = frame.below_hard_fp_saved_regs_size; ++ unsigned reg1 = frame.wb_pop_candidate1; ++ unsigned reg2 = frame.wb_pop_candidate2; ++ unsigned int last_gpr = (frame.is_scs_enabled + ? R29_REGNUM : R30_REGNUM); + rtx cfi_ops = NULL; + rtx_insn *insn; +@@ -9888,7 +9893,7 @@ aarch64_expand_epilogue (bool for_sibcall) + /* We need to add memory barrier to prevent read from deallocated stack. 
*/ + bool need_barrier_p + = maybe_ne (get_frame_size () +- + cfun->machine->frame.saved_varargs_size, 0); ++ + frame.saved_varargs_size, 0); + + /* Emit a barrier to prevent loads from a deallocated stack. */ + if (maybe_gt (final_adjust, crtl->outgoing_args_size) +@@ -9969,7 +9974,7 @@ aarch64_expand_epilogue (bool for_sibcall) + } + + /* Pop return address from shadow call stack. */ +- if (cfun->machine->frame.is_scs_enabled) ++ if (frame.is_scs_enabled) + { + machine_mode mode = aarch64_reg_save_mode (R30_REGNUM); + rtx reg = gen_rtx_REG (mode, R30_REGNUM); +@@ -12564,24 +12569,24 @@ aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to) + poly_int64 + aarch64_initial_elimination_offset (unsigned from, unsigned to) + { ++ aarch64_frame &frame = cfun->machine->frame; ++ + if (to == HARD_FRAME_POINTER_REGNUM) + { + if (from == ARG_POINTER_REGNUM) +- return cfun->machine->frame.hard_fp_offset; ++ return frame.hard_fp_offset; + + if (from == FRAME_POINTER_REGNUM) +- return cfun->machine->frame.hard_fp_offset +- - cfun->machine->frame.locals_offset; ++ return frame.hard_fp_offset - frame.locals_offset; + } + + if (to == STACK_POINTER_REGNUM) + { + if (from == FRAME_POINTER_REGNUM) +- return cfun->machine->frame.frame_size +- - cfun->machine->frame.locals_offset; ++ return frame.frame_size - frame.locals_offset; + } + +- return cfun->machine->frame.frame_size; ++ return frame.frame_size; + } + + +-- +2.34.1 + + +From 12a8889de169f892d2e927584c00d20b8b7e456f Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:49 +0100 +Subject: [PATCH 02/19] aarch64: Avoid a use of callee_offset + +When we emit the frame chain, i.e. when we reach Here in this statement +of aarch64_expand_prologue: + + if (emit_frame_chain) + { + // Here + ... + } + +the stack is in one of two states: + +- We've allocated up to the frame chain, but no more. 
+ +- We've allocated the whole frame, and the frame chain is within easy + reach of the new SP. + +The offset of the frame chain from the current SP is available +in aarch64_frame as callee_offset. It is also available as the +chain_offset local variable, where the latter is calculated from other +data. (However, chain_offset is not always equal to callee_offset when +!emit_frame_chain, so chain_offset isn't redundant.) + +In c600df9a4060da3c6121ff4d0b93f179eafd69d1 I switched to using +chain_offset for the initialisation of the hard frame pointer: + + aarch64_add_offset (Pmode, hard_frame_pointer_rtx, +- stack_pointer_rtx, callee_offset, ++ stack_pointer_rtx, chain_offset, + tmp1_rtx, tmp0_rtx, frame_pointer_needed); + +But the later REG_CFA_ADJUST_CFA handling still used callee_offset. + +I think the difference is harmless, but it's more logical for the +CFA note to be in sync, and it's more convenient for later patches +if it uses chain_offset. + +gcc/ + * config/aarch64/aarch64.cc (aarch64_expand_prologue): Use + chain_offset rather than callee_offset. +--- + gcc/config/aarch64/aarch64.cc | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index ae42ffdedbe..79253322fd7 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -9670,7 +9670,6 @@ aarch64_expand_prologue (void) + poly_int64 initial_adjust = frame.initial_adjust; + HOST_WIDE_INT callee_adjust = frame.callee_adjust; + poly_int64 final_adjust = frame.final_adjust; +- poly_int64 callee_offset = frame.callee_offset; + poly_int64 sve_callee_adjust = frame.sve_callee_adjust; + poly_int64 below_hard_fp_saved_regs_size + = frame.below_hard_fp_saved_regs_size; +@@ -9783,8 +9782,7 @@ aarch64_expand_prologue (void) + implicit. 
*/ + if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX)) + { +- rtx src = plus_constant (Pmode, stack_pointer_rtx, +- callee_offset); ++ rtx src = plus_constant (Pmode, stack_pointer_rtx, chain_offset); + add_reg_note (insn, REG_CFA_ADJUST_CFA, + gen_rtx_SET (hard_frame_pointer_rtx, src)); + } +-- +2.34.1 + + +From 03d5e89e7f3be53fd7142556e8e0a2774c653dca Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:49 +0100 +Subject: [PATCH 03/19] aarch64: Explicitly handle frames with no saved + registers + +If a frame has no saved registers, it can be allocated in one go. +There is no need to treat the areas below and above the saved +registers as separate. + +And if we allocate the frame in one go, it should be allocated +as the initial_adjust rather than the final_adjust. This allows the +frame size to grow to guard_size - guard_used_by_caller before a stack +probe is needed. (A frame with no register saves is necessarily a +leaf frame.) + +This is a no-op as things stand, since a leaf function will have +no outgoing arguments, and so all the frame will be above where +the saved registers normally go. + +gcc/ + * config/aarch64/aarch64.cc (aarch64_layout_frame): Explicitly + allocate the frame in one go if there are no saved registers. 
+--- + gcc/config/aarch64/aarch64.cc | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 79253322fd7..e1f21230c15 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8378,9 +8378,11 @@ aarch64_layout_frame (void) + + HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset; + HOST_WIDE_INT const_saved_regs_size; +- if (frame.frame_size.is_constant (&const_size) +- && const_size < max_push_offset +- && known_eq (frame.hard_fp_offset, const_size)) ++ if (known_eq (frame.saved_regs_size, 0)) ++ frame.initial_adjust = frame.frame_size; ++ else if (frame.frame_size.is_constant (&const_size) ++ && const_size < max_push_offset ++ && known_eq (frame.hard_fp_offset, const_size)) + { + /* Simple, small frame with no outgoing arguments: + +-- +2.34.1 + + +From 49c2eb7616756c323b7f6b18d8616ec945eb1263 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:49 +0100 +Subject: [PATCH 04/19] aarch64: Add bytes_below_saved_regs to frame info + +The frame layout code currently hard-codes the assumption that +the number of bytes below the saved registers is equal to the +size of the outgoing arguments. This patch abstracts that +value into a new field of aarch64_frame. + +gcc/ + * config/aarch64/aarch64.h (aarch64_frame::bytes_below_saved_regs): New + field. + * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize it, + and use it instead of crtl->outgoing_args_size. + (aarch64_get_separate_components): Use bytes_below_saved_regs instead + of outgoing_args_size. + (aarch64_process_components): Likewise. 
+--- + gcc/config/aarch64/aarch64.cc | 71 ++++++++++++++++++----------------- + gcc/config/aarch64/aarch64.h | 5 +++ + 2 files changed, 41 insertions(+), 35 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index e1f21230c15..94e1b686584 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8217,6 +8217,8 @@ aarch64_layout_frame (void) + gcc_assert (crtl->is_leaf + || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED)); + ++ frame.bytes_below_saved_regs = crtl->outgoing_args_size; ++ + /* Now assign stack slots for the registers. Start with the predicate + registers, since predicate LDR and STR have a relatively small + offset range. These saves happen below the hard frame pointer. */ +@@ -8321,18 +8323,18 @@ aarch64_layout_frame (void) + + poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size; + +- poly_int64 above_outgoing_args ++ poly_int64 saved_regs_and_above + = aligned_upper_bound (varargs_and_saved_regs_size + + get_frame_size (), + STACK_BOUNDARY / BITS_PER_UNIT); + + frame.hard_fp_offset +- = above_outgoing_args - frame.below_hard_fp_saved_regs_size; ++ = saved_regs_and_above - frame.below_hard_fp_saved_regs_size; + + /* Both these values are already aligned. 
*/ +- gcc_assert (multiple_p (crtl->outgoing_args_size, ++ gcc_assert (multiple_p (frame.bytes_below_saved_regs, + STACK_BOUNDARY / BITS_PER_UNIT)); +- frame.frame_size = above_outgoing_args + crtl->outgoing_args_size; ++ frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs; + + frame.locals_offset = frame.saved_varargs_size; + +@@ -8376,7 +8378,7 @@ aarch64_layout_frame (void) + else if (frame.wb_pop_candidate1 != INVALID_REGNUM) + max_push_offset = 256; + +- HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset; ++ HOST_WIDE_INT const_size, const_below_saved_regs, const_fp_offset; + HOST_WIDE_INT const_saved_regs_size; + if (known_eq (frame.saved_regs_size, 0)) + frame.initial_adjust = frame.frame_size; +@@ -8384,31 +8386,31 @@ aarch64_layout_frame (void) + && const_size < max_push_offset + && known_eq (frame.hard_fp_offset, const_size)) + { +- /* Simple, small frame with no outgoing arguments: ++ /* Simple, small frame with no data below the saved registers. + + stp reg1, reg2, [sp, -frame_size]! + stp reg3, reg4, [sp, 16] */ + frame.callee_adjust = const_size; + } +- else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size) ++ else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs) + && frame.saved_regs_size.is_constant (&const_saved_regs_size) +- && const_outgoing_args_size + const_saved_regs_size < 512 +- /* We could handle this case even with outgoing args, provided +- that the number of args left us with valid offsets for all +- predicate and vector save slots. It's such a rare case that +- it hardly seems worth the effort though. */ +- && (!saves_below_hard_fp_p || const_outgoing_args_size == 0) ++ && const_below_saved_regs + const_saved_regs_size < 512 ++ /* We could handle this case even with data below the saved ++ registers, provided that that data left us with valid offsets ++ for all predicate and vector save slots. 
It's such a rare ++ case that it hardly seems worth the effort though. */ ++ && (!saves_below_hard_fp_p || const_below_saved_regs == 0) + && !(cfun->calls_alloca + && frame.hard_fp_offset.is_constant (&const_fp_offset) + && const_fp_offset < max_push_offset)) + { +- /* Frame with small outgoing arguments: ++ /* Frame with small area below the saved registers: + + sub sp, sp, frame_size +- stp reg1, reg2, [sp, outgoing_args_size] +- stp reg3, reg4, [sp, outgoing_args_size + 16] */ ++ stp reg1, reg2, [sp, bytes_below_saved_regs] ++ stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */ + frame.initial_adjust = frame.frame_size; +- frame.callee_offset = const_outgoing_args_size; ++ frame.callee_offset = const_below_saved_regs; + } + else if (saves_below_hard_fp_p + && known_eq (frame.saved_regs_size, +@@ -8418,30 +8420,29 @@ aarch64_layout_frame (void) + + sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size + save SVE registers relative to SP +- sub sp, sp, outgoing_args_size */ ++ sub sp, sp, bytes_below_saved_regs */ + frame.initial_adjust = (frame.hard_fp_offset + + frame.below_hard_fp_saved_regs_size); +- frame.final_adjust = crtl->outgoing_args_size; ++ frame.final_adjust = frame.bytes_below_saved_regs; + } + else if (frame.hard_fp_offset.is_constant (&const_fp_offset) + && const_fp_offset < max_push_offset) + { +- /* Frame with large outgoing arguments or SVE saves, but with +- a small local area: ++ /* Frame with large area below the saved registers, or with SVE saves, ++ but with a small area above: + + stp reg1, reg2, [sp, -hard_fp_offset]! 
+ stp reg3, reg4, [sp, 16] + [sub sp, sp, below_hard_fp_saved_regs_size] + [save SVE registers relative to SP] +- sub sp, sp, outgoing_args_size */ ++ sub sp, sp, bytes_below_saved_regs */ + frame.callee_adjust = const_fp_offset; + frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; +- frame.final_adjust = crtl->outgoing_args_size; ++ frame.final_adjust = frame.bytes_below_saved_regs; + } + else + { +- /* Frame with large local area and outgoing arguments or SVE saves, +- using frame pointer: ++ /* General case: + + sub sp, sp, hard_fp_offset + stp x29, x30, [sp, 0] +@@ -8449,10 +8450,10 @@ aarch64_layout_frame (void) + stp reg3, reg4, [sp, 16] + [sub sp, sp, below_hard_fp_saved_regs_size] + [save SVE registers relative to SP] +- sub sp, sp, outgoing_args_size */ ++ sub sp, sp, bytes_below_saved_regs */ + frame.initial_adjust = frame.hard_fp_offset; + frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; +- frame.final_adjust = crtl->outgoing_args_size; ++ frame.final_adjust = frame.bytes_below_saved_regs; + } + + /* Make sure the individual adjustments add up to the full frame size. */ +@@ -9043,7 +9044,7 @@ aarch64_get_separate_components (void) + if (frame_pointer_needed) + offset -= frame.below_hard_fp_saved_regs_size; + else +- offset += crtl->outgoing_args_size; ++ offset += frame.bytes_below_saved_regs; + + /* Check that we can access the stack slot of the register with one + direct load with no adjustments needed. 
*/ +@@ -9192,7 +9193,7 @@ aarch64_process_components (sbitmap components, bool prologue_p) + if (frame_pointer_needed) + offset -= frame.below_hard_fp_saved_regs_size; + else +- offset += crtl->outgoing_args_size; ++ offset += frame.bytes_below_saved_regs; + + rtx addr = plus_constant (Pmode, ptr_reg, offset); + rtx mem = gen_frame_mem (mode, addr); +@@ -9246,7 +9247,7 @@ aarch64_process_components (sbitmap components, bool prologue_p) + if (frame_pointer_needed) + offset2 -= frame.below_hard_fp_saved_regs_size; + else +- offset2 += crtl->outgoing_args_size; ++ offset2 += frame.bytes_below_saved_regs; + rtx addr2 = plus_constant (Pmode, ptr_reg, offset2); + rtx mem2 = gen_frame_mem (mode, addr2); + rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2) +@@ -9320,10 +9321,10 @@ aarch64_stack_clash_protection_alloca_probe_range (void) + registers. If POLY_SIZE is not large enough to require a probe this function + will only adjust the stack. When allocating the stack space + FRAME_RELATED_P is then used to indicate if the allocation is frame related. +- FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing +- arguments. If we are then we ensure that any allocation larger than the ABI +- defined buffer needs a probe so that the invariant of having a 1KB buffer is +- maintained. ++ FINAL_ADJUSTMENT_P indicates whether we are allocating the area below ++ the saved registers. If we are then we ensure that any allocation ++ larger than the ABI defined buffer needs a probe so that the ++ invariant of having a 1KB buffer is maintained. + + We emit barriers after each stack adjustment to prevent optimizations from + breaking the invariant that we never drop the stack more than a page. This +@@ -9532,7 +9533,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, + /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to + be probed. This maintains the requirement that each page is probed at + least once. 
For initial probing we probe only if the allocation is +- more than GUARD_SIZE - buffer, and for the outgoing arguments we probe ++ more than GUARD_SIZE - buffer, and below the saved registers we probe + if the amount is larger than buffer. GUARD_SIZE - buffer + buffer == + GUARD_SIZE. This works that for any allocation that is large enough to + trigger a probe here, we'll have at least one, and if they're not large +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h +index 6834c3e9922..1e105e12db8 100644 +--- a/gcc/config/aarch64/aarch64.h ++++ b/gcc/config/aarch64/aarch64.h +@@ -871,6 +871,11 @@ struct GTY (()) aarch64_frame + /* The size of the callee-save registers with a slot in REG_OFFSET. */ + poly_int64 saved_regs_size; + ++ /* The number of bytes between the bottom of the static frame (the bottom ++ of the outgoing arguments) and the bottom of the register save area. ++ This value is always a multiple of STACK_BOUNDARY. */ ++ poly_int64 bytes_below_saved_regs; ++ + /* The size of the callee-save registers with a slot in REG_OFFSET that + are saved below the hard frame pointer. */ + poly_int64 below_hard_fp_saved_regs_size; +-- +2.34.1 + + +From 34081079ea4de0c98331843f574b5f6f94d7b234 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:50 +0100 +Subject: [PATCH 05/19] aarch64: Add bytes_below_hard_fp to frame info + +Following on from the previous bytes_below_saved_regs patch, this one +records the number of bytes that are below the hard frame pointer. +This eventually replaces below_hard_fp_saved_regs_size. + +If a frame pointer is not needed, the epilogue adds final_adjust +to the stack pointer before restoring registers: + + aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true); + +Therefore, if the epilogue needs to restore the stack pointer from +the hard frame pointer, the directly corresponding offset is: + + -bytes_below_hard_fp + final_adjust + +i.e. 
go from the hard frame pointer to the bottom of the frame, +then add the same amount as if we were using the stack pointer +from the outset. + +gcc/ + * config/aarch64/aarch64.h (aarch64_frame::bytes_below_hard_fp): New + field. + * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize it. + (aarch64_expand_epilogue): Use it instead of + below_hard_fp_saved_regs_size. +--- + gcc/config/aarch64/aarch64.cc | 6 +++--- + gcc/config/aarch64/aarch64.h | 5 +++++ + 2 files changed, 8 insertions(+), 3 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 94e1b686584..c7d84245fbf 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8269,6 +8269,7 @@ aarch64_layout_frame (void) + of the callee save area. */ + bool saves_below_hard_fp_p = maybe_ne (offset, 0); + frame.below_hard_fp_saved_regs_size = offset; ++ frame.bytes_below_hard_fp = offset + frame.bytes_below_saved_regs; + if (frame.emit_frame_chain) + { + /* FP and LR are placed in the linkage record. */ +@@ -9856,8 +9857,7 @@ aarch64_expand_epilogue (bool for_sibcall) + poly_int64 final_adjust = frame.final_adjust; + poly_int64 callee_offset = frame.callee_offset; + poly_int64 sve_callee_adjust = frame.sve_callee_adjust; +- poly_int64 below_hard_fp_saved_regs_size +- = frame.below_hard_fp_saved_regs_size; ++ poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp; + unsigned reg1 = frame.wb_pop_candidate1; + unsigned reg2 = frame.wb_pop_candidate2; + unsigned int last_gpr = (frame.is_scs_enabled +@@ -9915,7 +9915,7 @@ aarch64_expand_epilogue (bool for_sibcall) + is restored on the instruction doing the writeback. 
*/ + aarch64_add_offset (Pmode, stack_pointer_rtx, + hard_frame_pointer_rtx, +- -callee_offset - below_hard_fp_saved_regs_size, ++ -bytes_below_hard_fp + final_adjust, + tmp1_rtx, tmp0_rtx, callee_adjust == 0); + else + /* The case where we need to re-use the register here is very rare, so +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h +index 1e105e12db8..de68ff7202f 100644 +--- a/gcc/config/aarch64/aarch64.h ++++ b/gcc/config/aarch64/aarch64.h +@@ -880,6 +880,11 @@ struct GTY (()) aarch64_frame + are saved below the hard frame pointer. */ + poly_int64 below_hard_fp_saved_regs_size; + ++ /* The number of bytes between the bottom of the static frame (the bottom ++ of the outgoing arguments) and the hard frame pointer. This value is ++ always a multiple of STACK_BOUNDARY. */ ++ poly_int64 bytes_below_hard_fp; ++ + /* Offset from the base of the frame (incomming SP) to the + top of the locals area. This value is always a multiple of + STACK_BOUNDARY. */ +-- +2.34.1 + + +From 187861af7c51db9eddc6f954b589c121b210fc74 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:50 +0100 +Subject: [PATCH 06/19] aarch64: Tweak aarch64_save/restore_callee_saves + +aarch64_save_callee_saves and aarch64_restore_callee_saves took +a parameter called start_offset that gives the offset of the +bottom of the saved register area from the current stack pointer. +However, it's more convenient for later patches if we use the +bottom of the entire frame as the reference point, rather than +the bottom of the saved registers. + +Doing that removes the need for the callee_offset field. +Other than that, this is not a win on its own. It only really +makes sense in combination with the follow-on patches. + +gcc/ + * config/aarch64/aarch64.h (aarch64_frame::callee_offset): Delete. + * config/aarch64/aarch64.cc (aarch64_layout_frame): Remove + callee_offset handling. 
+ (aarch64_save_callee_saves): Replace the start_offset parameter + with a bytes_below_sp parameter. + (aarch64_restore_callee_saves): Likewise. + (aarch64_expand_prologue): Update accordingly. + (aarch64_expand_epilogue): Likewise. +--- + gcc/config/aarch64/aarch64.cc | 56 +++++++++++++++++------------------ + gcc/config/aarch64/aarch64.h | 4 --- + 2 files changed, 28 insertions(+), 32 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index c7d84245fbf..e79551af41d 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8343,7 +8343,6 @@ aarch64_layout_frame (void) + frame.final_adjust = 0; + frame.callee_adjust = 0; + frame.sve_callee_adjust = 0; +- frame.callee_offset = 0; + + frame.wb_pop_candidate1 = frame.wb_push_candidate1; + frame.wb_pop_candidate2 = frame.wb_push_candidate2; +@@ -8411,7 +8410,6 @@ aarch64_layout_frame (void) + stp reg1, reg2, [sp, bytes_below_saved_regs] + stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */ + frame.initial_adjust = frame.frame_size; +- frame.callee_offset = const_below_saved_regs; + } + else if (saves_below_hard_fp_p + && known_eq (frame.saved_regs_size, +@@ -8758,12 +8756,13 @@ aarch64_add_cfa_expression (rtx_insn *insn, rtx reg, + } + + /* Emit code to save the callee-saved registers from register number START +- to LIMIT to the stack at the location starting at offset START_OFFSET, +- skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P +- is true if the hard frame pointer has been set up. */ ++ to LIMIT to the stack. The stack pointer is currently BYTES_BELOW_SP ++ bytes above the bottom of the static frame. Skip any write-back ++ candidates if SKIP_WB is true. HARD_FP_VALID_P is true if the hard ++ frame pointer has been set up. 
*/ + + static void +-aarch64_save_callee_saves (poly_int64 start_offset, ++aarch64_save_callee_saves (poly_int64 bytes_below_sp, + unsigned start, unsigned limit, bool skip_wb, + bool hard_fp_valid_p) + { +@@ -8791,7 +8790,9 @@ aarch64_save_callee_saves (poly_int64 start_offset, + + machine_mode mode = aarch64_reg_save_mode (regno); + reg = gen_rtx_REG (mode, regno); +- offset = start_offset + frame.reg_offset[regno]; ++ offset = (frame.reg_offset[regno] ++ + frame.bytes_below_saved_regs ++ - bytes_below_sp); + rtx base_rtx = stack_pointer_rtx; + poly_int64 sp_offset = offset; + +@@ -8802,9 +8803,7 @@ aarch64_save_callee_saves (poly_int64 start_offset, + else if (GP_REGNUM_P (regno) + && (!offset.is_constant (&const_offset) || const_offset >= 512)) + { +- gcc_assert (known_eq (start_offset, 0)); +- poly_int64 fp_offset +- = frame.below_hard_fp_saved_regs_size; ++ poly_int64 fp_offset = frame.bytes_below_hard_fp - bytes_below_sp; + if (hard_fp_valid_p) + base_rtx = hard_frame_pointer_rtx; + else +@@ -8868,12 +8867,13 @@ aarch64_save_callee_saves (poly_int64 start_offset, + } + + /* Emit code to restore the callee registers from register number START +- up to and including LIMIT. Restore from the stack offset START_OFFSET, +- skipping any write-back candidates if SKIP_WB is true. Write the +- appropriate REG_CFA_RESTORE notes into CFI_OPS. */ ++ up to and including LIMIT. The stack pointer is currently BYTES_BELOW_SP ++ bytes above the bottom of the static frame. Skip any write-back ++ candidates if SKIP_WB is true. Write the appropriate REG_CFA_RESTORE ++ notes into CFI_OPS. 
*/ + + static void +-aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, ++aarch64_restore_callee_saves (poly_int64 bytes_below_sp, unsigned start, + unsigned limit, bool skip_wb, rtx *cfi_ops) + { + aarch64_frame &frame = cfun->machine->frame; +@@ -8899,7 +8899,9 @@ aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, + + machine_mode mode = aarch64_reg_save_mode (regno); + reg = gen_rtx_REG (mode, regno); +- offset = start_offset + frame.reg_offset[regno]; ++ offset = (frame.reg_offset[regno] ++ + frame.bytes_below_saved_regs ++ - bytes_below_sp); + rtx base_rtx = stack_pointer_rtx; + if (mode == VNx2DImode && BYTES_BIG_ENDIAN) + aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg, +@@ -9675,8 +9677,6 @@ aarch64_expand_prologue (void) + HOST_WIDE_INT callee_adjust = frame.callee_adjust; + poly_int64 final_adjust = frame.final_adjust; + poly_int64 sve_callee_adjust = frame.sve_callee_adjust; +- poly_int64 below_hard_fp_saved_regs_size +- = frame.below_hard_fp_saved_regs_size; + unsigned reg1 = frame.wb_push_candidate1; + unsigned reg2 = frame.wb_push_candidate2; + bool emit_frame_chain = frame.emit_frame_chain; +@@ -9752,8 +9752,8 @@ aarch64_expand_prologue (void) + - frame.hard_fp_offset); + gcc_assert (known_ge (chain_offset, 0)); + +- /* The offset of the bottom of the save area from the current SP. */ +- poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size; ++ /* The offset of the current SP from the bottom of the static frame. 
*/ ++ poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust; + + if (emit_frame_chain) + { +@@ -9761,7 +9761,7 @@ aarch64_expand_prologue (void) + { + reg1 = R29_REGNUM; + reg2 = R30_REGNUM; +- aarch64_save_callee_saves (saved_regs_offset, reg1, reg2, ++ aarch64_save_callee_saves (bytes_below_sp, reg1, reg2, + false, false); + } + else +@@ -9801,7 +9801,7 @@ aarch64_expand_prologue (void) + emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx)); + } + +- aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM, ++ aarch64_save_callee_saves (bytes_below_sp, R0_REGNUM, R30_REGNUM, + callee_adjust != 0 || emit_frame_chain, + emit_frame_chain); + if (maybe_ne (sve_callee_adjust, 0)) +@@ -9811,16 +9811,17 @@ aarch64_expand_prologue (void) + aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, + sve_callee_adjust, + !frame_pointer_needed, false); +- saved_regs_offset += sve_callee_adjust; ++ bytes_below_sp -= sve_callee_adjust; + } +- aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM, ++ aarch64_save_callee_saves (bytes_below_sp, P0_REGNUM, P15_REGNUM, + false, emit_frame_chain); +- aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM, ++ aarch64_save_callee_saves (bytes_below_sp, V0_REGNUM, V31_REGNUM, + callee_adjust != 0 || emit_frame_chain, + emit_frame_chain); + + /* We may need to probe the final adjustment if it is larger than the guard + that is assumed by the called. 
*/ ++ gcc_assert (known_eq (bytes_below_sp, final_adjust)); + aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust, + !frame_pointer_needed, true); + } +@@ -9855,7 +9856,6 @@ aarch64_expand_epilogue (bool for_sibcall) + poly_int64 initial_adjust = frame.initial_adjust; + HOST_WIDE_INT callee_adjust = frame.callee_adjust; + poly_int64 final_adjust = frame.final_adjust; +- poly_int64 callee_offset = frame.callee_offset; + poly_int64 sve_callee_adjust = frame.sve_callee_adjust; + poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp; + unsigned reg1 = frame.wb_pop_candidate1; +@@ -9925,9 +9925,9 @@ aarch64_expand_epilogue (bool for_sibcall) + + /* Restore the vector registers before the predicate registers, + so that we can use P4 as a temporary for big-endian SVE frames. */ +- aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM, ++ aarch64_restore_callee_saves (final_adjust, V0_REGNUM, V31_REGNUM, + callee_adjust != 0, &cfi_ops); +- aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM, ++ aarch64_restore_callee_saves (final_adjust, P0_REGNUM, P15_REGNUM, + false, &cfi_ops); + if (maybe_ne (sve_callee_adjust, 0)) + aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true); +@@ -9935,7 +9935,7 @@ aarch64_expand_epilogue (bool for_sibcall) + /* When shadow call stack is enabled, the scs_pop in the epilogue will + restore x30, we don't need to restore x30 again in the traditional + way. */ +- aarch64_restore_callee_saves (callee_offset - sve_callee_adjust, ++ aarch64_restore_callee_saves (final_adjust + sve_callee_adjust, + R0_REGNUM, last_gpr, + callee_adjust != 0, &cfi_ops); + +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h +index de68ff7202f..94fca4b9471 100644 +--- a/gcc/config/aarch64/aarch64.h ++++ b/gcc/config/aarch64/aarch64.h +@@ -907,10 +907,6 @@ struct GTY (()) aarch64_frame + It is zero when no push is used. 
*/ + HOST_WIDE_INT callee_adjust; + +- /* The offset from SP to the callee-save registers after initial_adjust. +- It may be non-zero if no push is used (ie. callee_adjust == 0). */ +- poly_int64 callee_offset; +- + /* The size of the stack adjustment before saving or after restoring + SVE registers. */ + poly_int64 sve_callee_adjust; +-- +2.34.1 + + +From 2b983f9064d808daf909bde1d4a13980934a7e6e Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:51 +0100 +Subject: [PATCH 07/19] aarch64: Only calculate chain_offset if there is a + chain + +After previous patches, it is no longer necessary to calculate +a chain_offset in cases where there is no chain record. + +gcc/ + * config/aarch64/aarch64.cc (aarch64_expand_prologue): Move the + calculation of chain_offset into the emit_frame_chain block. +--- + gcc/config/aarch64/aarch64.cc | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index e79551af41d..d71a042d611 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -9747,16 +9747,16 @@ aarch64_expand_prologue (void) + if (callee_adjust != 0) + aarch64_push_regs (reg1, reg2, callee_adjust); + +- /* The offset of the frame chain record (if any) from the current SP. */ +- poly_int64 chain_offset = (initial_adjust + callee_adjust +- - frame.hard_fp_offset); +- gcc_assert (known_ge (chain_offset, 0)); +- + /* The offset of the current SP from the bottom of the static frame. */ + poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust; + + if (emit_frame_chain) + { ++ /* The offset of the frame chain record (if any) from the current SP. 
*/ ++ poly_int64 chain_offset = (initial_adjust + callee_adjust ++ - frame.hard_fp_offset); ++ gcc_assert (known_ge (chain_offset, 0)); ++ + if (callee_adjust == 0) + { + reg1 = R29_REGNUM; +-- +2.34.1 + + +From 0a0a824808d1dec51004fb5805c1a0ae2a35433f Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:51 +0100 +Subject: [PATCH 08/19] aarch64: Rename locals_offset to bytes_above_locals +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +locals_offset was described as: + + /* Offset from the base of the frame (incomming SP) to the + top of the locals area. This value is always a multiple of + STACK_BOUNDARY. */ + +This is implicitly an “upside down” view of the frame: the incoming +SP is at offset 0, and anything N bytes below the incoming SP is at +offset N (rather than -N). + +However, reg_offset instead uses a “right way up” view; that is, +it views offsets in address terms. Something above X is at a +positive offset from X and something below X is at a negative +offset from X. + +Also, even on FRAME_GROWS_DOWNWARD targets like AArch64, +target-independent code views offsets in address terms too: +locals are allocated at negative offsets to virtual_stack_vars. + +It seems confusing to have *_offset fields of the same structure +using different polarities like this. This patch tries to avoid +that by renaming locals_offset to bytes_above_locals. + +gcc/ + * config/aarch64/aarch64.h (aarch64_frame::locals_offset): Rename to... + (aarch64_frame::bytes_above_locals): ...this. + * config/aarch64/aarch64.cc (aarch64_layout_frame) + (aarch64_initial_elimination_offset): Update accordingly. 
+--- + gcc/config/aarch64/aarch64.cc | 6 +++--- + gcc/config/aarch64/aarch64.h | 6 +++--- + 2 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index d71a042d611..d4ec352ba98 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8337,7 +8337,7 @@ aarch64_layout_frame (void) + STACK_BOUNDARY / BITS_PER_UNIT)); + frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs; + +- frame.locals_offset = frame.saved_varargs_size; ++ frame.bytes_above_locals = frame.saved_varargs_size; + + frame.initial_adjust = 0; + frame.final_adjust = 0; +@@ -12578,13 +12578,13 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to) + return frame.hard_fp_offset; + + if (from == FRAME_POINTER_REGNUM) +- return frame.hard_fp_offset - frame.locals_offset; ++ return frame.hard_fp_offset - frame.bytes_above_locals; + } + + if (to == STACK_POINTER_REGNUM) + { + if (from == FRAME_POINTER_REGNUM) +- return frame.frame_size - frame.locals_offset; ++ return frame.frame_size - frame.bytes_above_locals; + } + + return frame.frame_size; +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h +index 94fca4b9471..bf46e6124aa 100644 +--- a/gcc/config/aarch64/aarch64.h ++++ b/gcc/config/aarch64/aarch64.h +@@ -885,10 +885,10 @@ struct GTY (()) aarch64_frame + always a multiple of STACK_BOUNDARY. */ + poly_int64 bytes_below_hard_fp; + +- /* Offset from the base of the frame (incomming SP) to the +- top of the locals area. This value is always a multiple of ++ /* The number of bytes between the top of the locals area and the top ++ of the frame (the incomming SP). This value is always a multiple of + STACK_BOUNDARY. */ +- poly_int64 locals_offset; ++ poly_int64 bytes_above_locals; + + /* Offset from the base of the frame (incomming SP) to the + hard_frame_pointer. 
This value is always a multiple of +-- +2.34.1 + + +From 3fbf0789202b30a67b12e1fb785c7130f098d665 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:52 +0100 +Subject: [PATCH 09/19] aarch64: Rename hard_fp_offset to bytes_above_hard_fp +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Similarly to the previous locals_offset patch, hard_fp_offset +was described as: + + /* Offset from the base of the frame (incomming SP) to the + hard_frame_pointer. This value is always a multiple of + STACK_BOUNDARY. */ + poly_int64 hard_fp_offset; + +which again took an “upside-down” view: higher offsets meant lower +addresses. This patch renames the field to bytes_above_hard_fp instead. + +gcc/ + * config/aarch64/aarch64.h (aarch64_frame::hard_fp_offset): Rename + to... + (aarch64_frame::bytes_above_hard_fp): ...this. + * config/aarch64/aarch64.cc (aarch64_layout_frame) + (aarch64_expand_prologue): Update accordingly. + (aarch64_initial_elimination_offset): Likewise. +--- + gcc/config/aarch64/aarch64.cc | 26 +++++++++++++------------- + gcc/config/aarch64/aarch64.h | 6 +++--- + 2 files changed, 16 insertions(+), 16 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index d4ec352ba98..3c4052740e7 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8329,7 +8329,7 @@ aarch64_layout_frame (void) + + get_frame_size (), + STACK_BOUNDARY / BITS_PER_UNIT); + +- frame.hard_fp_offset ++ frame.bytes_above_hard_fp + = saved_regs_and_above - frame.below_hard_fp_saved_regs_size; + + /* Both these values are already aligned. 
*/ +@@ -8378,13 +8378,13 @@ aarch64_layout_frame (void) + else if (frame.wb_pop_candidate1 != INVALID_REGNUM) + max_push_offset = 256; + +- HOST_WIDE_INT const_size, const_below_saved_regs, const_fp_offset; ++ HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp; + HOST_WIDE_INT const_saved_regs_size; + if (known_eq (frame.saved_regs_size, 0)) + frame.initial_adjust = frame.frame_size; + else if (frame.frame_size.is_constant (&const_size) + && const_size < max_push_offset +- && known_eq (frame.hard_fp_offset, const_size)) ++ && known_eq (frame.bytes_above_hard_fp, const_size)) + { + /* Simple, small frame with no data below the saved registers. + +@@ -8401,8 +8401,8 @@ aarch64_layout_frame (void) + case that it hardly seems worth the effort though. */ + && (!saves_below_hard_fp_p || const_below_saved_regs == 0) + && !(cfun->calls_alloca +- && frame.hard_fp_offset.is_constant (&const_fp_offset) +- && const_fp_offset < max_push_offset)) ++ && frame.bytes_above_hard_fp.is_constant (&const_above_fp) ++ && const_above_fp < max_push_offset)) + { + /* Frame with small area below the saved registers: + +@@ -8420,12 +8420,12 @@ aarch64_layout_frame (void) + sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size + save SVE registers relative to SP + sub sp, sp, bytes_below_saved_regs */ +- frame.initial_adjust = (frame.hard_fp_offset ++ frame.initial_adjust = (frame.bytes_above_hard_fp + + frame.below_hard_fp_saved_regs_size); + frame.final_adjust = frame.bytes_below_saved_regs; + } +- else if (frame.hard_fp_offset.is_constant (&const_fp_offset) +- && const_fp_offset < max_push_offset) ++ else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp) ++ && const_above_fp < max_push_offset) + { + /* Frame with large area below the saved registers, or with SVE saves, + but with a small area above: +@@ -8435,7 +8435,7 @@ aarch64_layout_frame (void) + [sub sp, sp, below_hard_fp_saved_regs_size] + [save SVE registers relative to SP] + sub sp, sp, 
bytes_below_saved_regs */ +- frame.callee_adjust = const_fp_offset; ++ frame.callee_adjust = const_above_fp; + frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; + frame.final_adjust = frame.bytes_below_saved_regs; + } +@@ -8450,7 +8450,7 @@ aarch64_layout_frame (void) + [sub sp, sp, below_hard_fp_saved_regs_size] + [save SVE registers relative to SP] + sub sp, sp, bytes_below_saved_regs */ +- frame.initial_adjust = frame.hard_fp_offset; ++ frame.initial_adjust = frame.bytes_above_hard_fp; + frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; + frame.final_adjust = frame.bytes_below_saved_regs; + } +@@ -9754,7 +9754,7 @@ aarch64_expand_prologue (void) + { + /* The offset of the frame chain record (if any) from the current SP. */ + poly_int64 chain_offset = (initial_adjust + callee_adjust +- - frame.hard_fp_offset); ++ - frame.bytes_above_hard_fp); + gcc_assert (known_ge (chain_offset, 0)); + + if (callee_adjust == 0) +@@ -12575,10 +12575,10 @@ aarch64_initial_elimination_offset (unsigned from, unsigned to) + if (to == HARD_FRAME_POINTER_REGNUM) + { + if (from == ARG_POINTER_REGNUM) +- return frame.hard_fp_offset; ++ return frame.bytes_above_hard_fp; + + if (from == FRAME_POINTER_REGNUM) +- return frame.hard_fp_offset - frame.bytes_above_locals; ++ return frame.bytes_above_hard_fp - frame.bytes_above_locals; + } + + if (to == STACK_POINTER_REGNUM) +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h +index bf46e6124aa..dd1f403f939 100644 +--- a/gcc/config/aarch64/aarch64.h ++++ b/gcc/config/aarch64/aarch64.h +@@ -890,10 +890,10 @@ struct GTY (()) aarch64_frame + STACK_BOUNDARY. */ + poly_int64 bytes_above_locals; + +- /* Offset from the base of the frame (incomming SP) to the +- hard_frame_pointer. This value is always a multiple of ++ /* The number of bytes between the hard_frame_pointer and the top of ++ the frame (the incomming SP). This value is always a multiple of + STACK_BOUNDARY. 
*/ +- poly_int64 hard_fp_offset; ++ poly_int64 bytes_above_hard_fp; + + /* The size of the frame. This value is the offset from base of the + frame (incomming SP) to the stack_pointer. This value is always +-- +2.34.1 + + +From aac8b31379ac3bbd14fc6427dce23f56e54e8485 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:52 +0100 +Subject: [PATCH 10/19] aarch64: Tweak frame_size comment +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This patch fixes another case in which a value was described with +an “upside-down” view. + +gcc/ + * config/aarch64/aarch64.h (aarch64_frame::frame_size): Tweak comment. +--- + gcc/config/aarch64/aarch64.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h +index dd1f403f939..700524ae22b 100644 +--- a/gcc/config/aarch64/aarch64.h ++++ b/gcc/config/aarch64/aarch64.h +@@ -895,8 +895,8 @@ struct GTY (()) aarch64_frame + STACK_BOUNDARY. */ + poly_int64 bytes_above_hard_fp; + +- /* The size of the frame. This value is the offset from base of the +- frame (incomming SP) to the stack_pointer. This value is always ++ /* The size of the frame, i.e. the number of bytes between the bottom ++ of the outgoing arguments and the incoming SP. This value is always + a multiple of STACK_BOUNDARY. */ + poly_int64 frame_size; + +-- +2.34.1 + + +From 8d5506a8aeb8dd7e8b209a3663b07688478f76b9 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:53 +0100 +Subject: [PATCH 11/19] aarch64: Measure reg_offset from the bottom of the + frame + +reg_offset was measured from the bottom of the saved register area. +This made perfect sense with the original layout, since the bottom +of the saved register area was also the hard frame pointer address. 
+It became slightly less obvious with SVE, since we save SVE +registers below the hard frame pointer, but it still made sense. + +However, if we want to allow different frame layouts, it's more +convenient and obvious to measure reg_offset from the bottom of +the frame. After previous patches, it's also a slight simplification +in its own right. + +gcc/ + * config/aarch64/aarch64.h (aarch64_frame): Add comment above + reg_offset. + * config/aarch64/aarch64.cc (aarch64_layout_frame): Walk offsets + from the bottom of the frame, rather than the bottom of the saved + register area. Measure reg_offset from the bottom of the frame + rather than the bottom of the saved register area. + (aarch64_save_callee_saves): Update accordingly. + (aarch64_restore_callee_saves): Likewise. + (aarch64_get_separate_components): Likewise. + (aarch64_process_components): Likewise. +--- + gcc/config/aarch64/aarch64.cc | 53 ++++++++++++++++------------------- + gcc/config/aarch64/aarch64.h | 3 ++ + 2 files changed, 27 insertions(+), 29 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 3c4052740e7..97dd077844b 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8139,7 +8139,6 @@ aarch64_needs_frame_chain (void) + static void + aarch64_layout_frame (void) + { +- poly_int64 offset = 0; + int regno, last_fp_reg = INVALID_REGNUM; + machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM); + poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode); +@@ -8217,7 +8216,9 @@ aarch64_layout_frame (void) + gcc_assert (crtl->is_leaf + || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED)); + +- frame.bytes_below_saved_regs = crtl->outgoing_args_size; ++ poly_int64 offset = crtl->outgoing_args_size; ++ gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT)); ++ frame.bytes_below_saved_regs = offset; + + /* Now assign stack slots for the registers. 
Start with the predicate + registers, since predicate LDR and STR have a relatively small +@@ -8229,7 +8230,8 @@ aarch64_layout_frame (void) + offset += BYTES_PER_SVE_PRED; + } + +- if (maybe_ne (offset, 0)) ++ poly_int64 saved_prs_size = offset - frame.bytes_below_saved_regs; ++ if (maybe_ne (saved_prs_size, 0)) + { + /* If we have any vector registers to save above the predicate registers, + the offset of the vector register save slots need to be a multiple +@@ -8247,10 +8249,10 @@ aarch64_layout_frame (void) + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); + else + { +- if (known_le (offset, vector_save_size)) +- offset = vector_save_size; +- else if (known_le (offset, vector_save_size * 2)) +- offset = vector_save_size * 2; ++ if (known_le (saved_prs_size, vector_save_size)) ++ offset = frame.bytes_below_saved_regs + vector_save_size; ++ else if (known_le (saved_prs_size, vector_save_size * 2)) ++ offset = frame.bytes_below_saved_regs + vector_save_size * 2; + else + gcc_unreachable (); + } +@@ -8267,9 +8269,10 @@ aarch64_layout_frame (void) + + /* OFFSET is now the offset of the hard frame pointer from the bottom + of the callee save area. */ +- bool saves_below_hard_fp_p = maybe_ne (offset, 0); +- frame.below_hard_fp_saved_regs_size = offset; +- frame.bytes_below_hard_fp = offset + frame.bytes_below_saved_regs; ++ frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs; ++ bool saves_below_hard_fp_p ++ = maybe_ne (frame.below_hard_fp_saved_regs_size, 0); ++ frame.bytes_below_hard_fp = offset; + if (frame.emit_frame_chain) + { + /* FP and LR are placed in the linkage record. 
*/ +@@ -8320,9 +8323,10 @@ aarch64_layout_frame (void) + + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); + +- frame.saved_regs_size = offset; ++ frame.saved_regs_size = offset - frame.bytes_below_saved_regs; + +- poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size; ++ poly_int64 varargs_and_saved_regs_size ++ = frame.saved_regs_size + frame.saved_varargs_size; + + poly_int64 saved_regs_and_above + = aligned_upper_bound (varargs_and_saved_regs_size +@@ -8790,9 +8794,7 @@ aarch64_save_callee_saves (poly_int64 bytes_below_sp, + + machine_mode mode = aarch64_reg_save_mode (regno); + reg = gen_rtx_REG (mode, regno); +- offset = (frame.reg_offset[regno] +- + frame.bytes_below_saved_regs +- - bytes_below_sp); ++ offset = frame.reg_offset[regno] - bytes_below_sp; + rtx base_rtx = stack_pointer_rtx; + poly_int64 sp_offset = offset; + +@@ -8899,9 +8901,7 @@ aarch64_restore_callee_saves (poly_int64 bytes_below_sp, unsigned start, + + machine_mode mode = aarch64_reg_save_mode (regno); + reg = gen_rtx_REG (mode, regno); +- offset = (frame.reg_offset[regno] +- + frame.bytes_below_saved_regs +- - bytes_below_sp); ++ offset = frame.reg_offset[regno] - bytes_below_sp; + rtx base_rtx = stack_pointer_rtx; + if (mode == VNx2DImode && BYTES_BIG_ENDIAN) + aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg, +@@ -9040,14 +9040,12 @@ aarch64_get_separate_components (void) + it as a stack probe for -fstack-clash-protection. */ + if (flag_stack_clash_protection + && maybe_ne (frame.below_hard_fp_saved_regs_size, 0) +- && known_eq (offset, 0)) ++ && known_eq (offset, frame.bytes_below_saved_regs)) + continue; + + /* Get the offset relative to the register we'll use. 
*/ + if (frame_pointer_needed) +- offset -= frame.below_hard_fp_saved_regs_size; +- else +- offset += frame.bytes_below_saved_regs; ++ offset -= frame.bytes_below_hard_fp; + + /* Check that we can access the stack slot of the register with one + direct load with no adjustments needed. */ +@@ -9194,9 +9192,7 @@ aarch64_process_components (sbitmap components, bool prologue_p) + rtx reg = gen_rtx_REG (mode, regno); + poly_int64 offset = frame.reg_offset[regno]; + if (frame_pointer_needed) +- offset -= frame.below_hard_fp_saved_regs_size; +- else +- offset += frame.bytes_below_saved_regs; ++ offset -= frame.bytes_below_hard_fp; + + rtx addr = plus_constant (Pmode, ptr_reg, offset); + rtx mem = gen_frame_mem (mode, addr); +@@ -9248,9 +9244,7 @@ aarch64_process_components (sbitmap components, bool prologue_p) + /* REGNO2 can be saved/restored in a pair with REGNO. */ + rtx reg2 = gen_rtx_REG (mode, regno2); + if (frame_pointer_needed) +- offset2 -= frame.below_hard_fp_saved_regs_size; +- else +- offset2 += frame.bytes_below_saved_regs; ++ offset2 -= frame.bytes_below_hard_fp; + rtx addr2 = plus_constant (Pmode, ptr_reg, offset2); + rtx mem2 = gen_frame_mem (mode, addr2); + rtx set2 = prologue_p ? 
gen_rtx_SET (mem2, reg2) +@@ -9366,7 +9360,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, + if (final_adjustment_p + && known_eq (frame.below_hard_fp_saved_regs_size, 0)) + { +- poly_int64 lr_offset = frame.reg_offset[LR_REGNUM]; ++ poly_int64 lr_offset = (frame.reg_offset[LR_REGNUM] ++ - frame.bytes_below_saved_regs); + if (known_ge (lr_offset, 0)) + min_probe_threshold -= lr_offset.to_constant (); + else +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h +index 700524ae22b..b6135837073 100644 +--- a/gcc/config/aarch64/aarch64.h ++++ b/gcc/config/aarch64/aarch64.h +@@ -860,6 +860,9 @@ extern enum aarch64_processor aarch64_tune; + #ifdef HAVE_POLY_INT_H + struct GTY (()) aarch64_frame + { ++ /* The offset from the bottom of the static frame (the bottom of the ++ outgoing arguments) of each register save slot, or -2 if no save is ++ needed. */ + poly_int64 reg_offset[LAST_SAVED_REGNUM + 1]; + + /* The number of extra stack bytes taken up by register varargs. +-- +2.34.1 + + +From b47766614df3b9df878262efb2ad73aaac108363 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:53 +0100 +Subject: [PATCH 12/19] aarch64: Simplify top of frame allocation + +After previous patches, it no longer really makes sense to allocate +the top of the frame in terms of varargs_and_saved_regs_size and +saved_regs_and_above. + +gcc/ + * config/aarch64/aarch64.cc (aarch64_layout_frame): Simplify + the allocation of the top of the frame. 
+--- + gcc/config/aarch64/aarch64.cc | 23 ++++++++--------------- + 1 file changed, 8 insertions(+), 15 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 97dd077844b..81935852d5b 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8325,23 +8325,16 @@ aarch64_layout_frame (void) + + frame.saved_regs_size = offset - frame.bytes_below_saved_regs; + +- poly_int64 varargs_and_saved_regs_size +- = frame.saved_regs_size + frame.saved_varargs_size; +- +- poly_int64 saved_regs_and_above +- = aligned_upper_bound (varargs_and_saved_regs_size +- + get_frame_size (), +- STACK_BOUNDARY / BITS_PER_UNIT); +- +- frame.bytes_above_hard_fp +- = saved_regs_and_above - frame.below_hard_fp_saved_regs_size; ++ offset += get_frame_size (); ++ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); ++ auto top_of_locals = offset; + +- /* Both these values are already aligned. */ +- gcc_assert (multiple_p (frame.bytes_below_saved_regs, +- STACK_BOUNDARY / BITS_PER_UNIT)); +- frame.frame_size = saved_regs_and_above + frame.bytes_below_saved_regs; ++ offset += frame.saved_varargs_size; ++ gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT)); ++ frame.frame_size = offset; + +- frame.bytes_above_locals = frame.saved_varargs_size; ++ frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp; ++ frame.bytes_above_locals = frame.frame_size - top_of_locals; + + frame.initial_adjust = 0; + frame.final_adjust = 0; +-- +2.34.1 + + +From 08f71b4bb28fb74d20e8d2927a557e8119ce9f4d Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:54 +0100 +Subject: [PATCH 13/19] aarch64: Minor initial adjustment tweak + +This patch just changes a calculation of initial_adjust +to one that makes it slightly more obvious that the total +adjustment is frame.frame_size. 
+ +gcc/ + * config/aarch64/aarch64.cc (aarch64_layout_frame): Tweak + calculation of initial_adjust for frames in which all saves + are SVE saves. +--- + gcc/config/aarch64/aarch64.cc | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 81935852d5b..4d9fcf3d162 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8414,11 +8414,10 @@ aarch64_layout_frame (void) + { + /* Frame in which all saves are SVE saves: + +- sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size ++ sub sp, sp, frame_size - bytes_below_saved_regs + save SVE registers relative to SP + sub sp, sp, bytes_below_saved_regs */ +- frame.initial_adjust = (frame.bytes_above_hard_fp +- + frame.below_hard_fp_saved_regs_size); ++ frame.initial_adjust = frame.frame_size - frame.bytes_below_saved_regs; + frame.final_adjust = frame.bytes_below_saved_regs; + } + else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp) +-- +2.34.1 + + +From f22315d5c19e8310e4dc880fd509678fd291fca8 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:54 +0100 +Subject: [PATCH 14/19] aarch64: Tweak stack clash boundary condition + +The AArch64 ABI says that, when stack clash protection is used, +there can be a maximum of 1KiB of unprobed space at sp on entry +to a function. Therefore, we need to probe when allocating +>= guard_size - 1KiB of data (>= rather than >). This is what +GCC does. + +If an allocation is exactly guard_size bytes, it is enough to allocate +those bytes and probe once at offset 1024. It isn't possible to use a +single probe at any other offset: higher would complicate later code, +by leaving more unprobed space than usual, while lower would risk +leaving an entire page unprobed. For simplicity, the code probes all +allocations at offset 1024. + +Some register saves also act as probes. 
If we need to allocate +more space below the last such register save probe, we need to +probe the allocation if it is > 1KiB. Again, this allocation is +then sometimes (but not always) probed at offset 1024. This sort of +allocation is currently only used for outgoing arguments, which are +rarely this big. + +However, the code also probed if this final outgoing-arguments +allocation was == 1KiB, rather than just > 1KiB. This isn't +necessary, since the register save then probes at offset 1024 +as required. Continuing to probe allocations of exactly 1KiB +would complicate later patches. + +gcc/ + * config/aarch64/aarch64.cc (aarch64_allocate_and_probe_stack_space): + Don't probe final allocations that are exactly 1KiB in size (after + unprobed space above the final allocation has been deducted). + +gcc/testsuite/ + * gcc.target/aarch64/stack-check-prologue-17.c: New test. +--- + gcc/config/aarch64/aarch64.cc | 4 +- + .../aarch64/stack-check-prologue-17.c | 55 +++++++++++++++++++ + 2 files changed, 58 insertions(+), 1 deletion(-) + create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 4d9fcf3d162..34c1d8614cd 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -9333,9 +9333,11 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, + HOST_WIDE_INT guard_size + = 1 << param_stack_clash_protection_guard_size; + HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD; ++ HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT; ++ gcc_assert (multiple_p (poly_size, byte_sp_alignment)); + HOST_WIDE_INT min_probe_threshold + = (final_adjustment_p +- ? guard_used_by_caller ++ ? guard_used_by_caller + byte_sp_alignment + : guard_size - guard_used_by_caller); + /* When doing the final adjustment for the outgoing arguments, take into + account any unprobed space there is above the current SP. 
There are +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c +new file mode 100644 +index 00000000000..0d8a25d73a2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c +@@ -0,0 +1,55 @@ ++/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++void f(int, ...); ++void g(); ++ ++/* ++** test1: ++** ... ++** str x30, \[sp\] ++** sub sp, sp, #1024 ++** cbnz w0, .* ++** bl g ++** ... ++*/ ++int test1(int z) { ++ __uint128_t x = 0; ++ int y[0x400]; ++ if (z) ++ { ++ f(0, 0, 0, 0, 0, 0, 0, &y, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); ++ } ++ g(); ++ return 1; ++} ++ ++/* ++** test2: ++** ... ++** str x30, \[sp\] ++** sub sp, sp, #1040 ++** str xzr, \[sp\] ++** cbnz w0, .* ++** bl g ++** ... ++*/ ++int test2(int z) { ++ __uint128_t x = 0; ++ int y[0x400]; ++ if (z) ++ { ++ f(0, 0, 0, 0, 0, 0, 0, &y, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x); ++ } ++ g(); ++ return 1; ++} +-- +2.34.1 + + +From 15e18831bf98fd25af098b970ebf0c9a6200a34b Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:55 +0100 +Subject: [PATCH 15/19] aarch64: Put LR save probe in first 16 bytes + +-fstack-clash-protection uses the save of LR as a probe for the next +allocation. The next allocation could be: + +* another part of the static frame, e.g. 
when allocating SVE save slots + or outgoing arguments + +* an alloca in the same function + +* an allocation made by a callee function + +However, when -fomit-frame-pointer is used, the LR save slot is placed +above the other GPR save slots. It could therefore be up to 80 bytes +above the base of the GPR save area (which is also the hard fp address). + +aarch64_allocate_and_probe_stack_space took this into account when +deciding how much subsequent space could be allocated without needing +a probe. However, it interacted badly with: + + /* If doing a small final adjustment, we always probe at offset 0. + This is done to avoid issues when LR is not at position 0 or when + the final adjustment is smaller than the probing offset. */ + else if (final_adjustment_p && rounded_size == 0) + residual_probe_offset = 0; + +which forces any allocation that is smaller than the guard page size +to be probed at offset 0 rather than the usual offset 1024. It was +therefore possible to construct cases in which we had: + +* a probe using LR at SP + 80 bytes (or some other value >= 16) +* an allocation of the guard page size - 16 bytes +* a probe at SP + 0 + +which allocates guard page size + 64 consecutive unprobed bytes. + +This patch requires the LR probe to be in the first 16 bytes of the +save area when stack clash protection is active. Doing it +unconditionally would cause code-quality regressions. + +Putting LR before other registers prevents push/pop allocation +when shadow call stacks are enabled, since LR is restored +separately from the other callee-saved registers. + +The new comment doesn't say that the probe register is required +to be LR, since a later patch removes that restriction. + +gcc/ + * config/aarch64/aarch64.cc (aarch64_layout_frame): Ensure that + the LR save slot is in the first 16 bytes of the register save area. + Only form STP/LDP push/pop candidates if both registers are valid. 
+ (aarch64_allocate_and_probe_stack_space): Remove workaround for + when LR was not in the first 16 bytes. + +gcc/testsuite/ + * gcc.target/aarch64/stack-check-prologue-18.c: New test. + * gcc.target/aarch64/stack-check-prologue-19.c: Likewise. + * gcc.target/aarch64/stack-check-prologue-20.c: Likewise. +--- + gcc/config/aarch64/aarch64.cc | 72 ++++++------- + .../aarch64/stack-check-prologue-18.c | 100 ++++++++++++++++++ + .../aarch64/stack-check-prologue-19.c | 100 ++++++++++++++++++ + .../aarch64/stack-check-prologue-20.c | 3 + + 4 files changed, 233 insertions(+), 42 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c + create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c + create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 34c1d8614cd..16433fb70f4 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8273,26 +8273,34 @@ aarch64_layout_frame (void) + bool saves_below_hard_fp_p + = maybe_ne (frame.below_hard_fp_saved_regs_size, 0); + frame.bytes_below_hard_fp = offset; ++ ++ auto allocate_gpr_slot = [&](unsigned int regno) ++ { ++ frame.reg_offset[regno] = offset; ++ if (frame.wb_push_candidate1 == INVALID_REGNUM) ++ frame.wb_push_candidate1 = regno; ++ else if (frame.wb_push_candidate2 == INVALID_REGNUM) ++ frame.wb_push_candidate2 = regno; ++ offset += UNITS_PER_WORD; ++ }; ++ + if (frame.emit_frame_chain) + { + /* FP and LR are placed in the linkage record. 
*/ +- frame.reg_offset[R29_REGNUM] = offset; +- frame.wb_push_candidate1 = R29_REGNUM; +- frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD; +- frame.wb_push_candidate2 = R30_REGNUM; +- offset += 2 * UNITS_PER_WORD; ++ allocate_gpr_slot (R29_REGNUM); ++ allocate_gpr_slot (R30_REGNUM); + } ++ else if (flag_stack_clash_protection ++ && known_eq (frame.reg_offset[R30_REGNUM], SLOT_REQUIRED)) ++ /* Put the LR save slot first, since it makes a good choice of probe ++ for stack clash purposes. The idea is that the link register usually ++ has to be saved before a call anyway, and so we lose little by ++ stopping it from being individually shrink-wrapped. */ ++ allocate_gpr_slot (R30_REGNUM); + + for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++) + if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) +- { +- frame.reg_offset[regno] = offset; +- if (frame.wb_push_candidate1 == INVALID_REGNUM) +- frame.wb_push_candidate1 = regno; +- else if (frame.wb_push_candidate2 == INVALID_REGNUM) +- frame.wb_push_candidate2 = regno; +- offset += UNITS_PER_WORD; +- } ++ allocate_gpr_slot (regno); + + poly_int64 max_int_offset = offset; + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); +@@ -8370,10 +8378,13 @@ aarch64_layout_frame (void) + max_push_offset to 0, because no registers are popped at this time, + so callee_adjust cannot be adjusted. */ + HOST_WIDE_INT max_push_offset = 0; +- if (frame.wb_pop_candidate2 != INVALID_REGNUM) +- max_push_offset = 512; +- else if (frame.wb_pop_candidate1 != INVALID_REGNUM) +- max_push_offset = 256; ++ if (frame.wb_pop_candidate1 != INVALID_REGNUM) ++ { ++ if (frame.wb_pop_candidate2 != INVALID_REGNUM) ++ max_push_offset = 512; ++ else ++ max_push_offset = 256; ++ } + + HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp; + HOST_WIDE_INT const_saved_regs_size; +@@ -9339,29 +9350,6 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, + = (final_adjustment_p + ? 
guard_used_by_caller + byte_sp_alignment + : guard_size - guard_used_by_caller); +- /* When doing the final adjustment for the outgoing arguments, take into +- account any unprobed space there is above the current SP. There are +- two cases: +- +- - When saving SVE registers below the hard frame pointer, we force +- the lowest save to take place in the prologue before doing the final +- adjustment (i.e. we don't allow the save to be shrink-wrapped). +- This acts as a probe at SP, so there is no unprobed space. +- +- - When there are no SVE register saves, we use the store of the link +- register as a probe. We can't assume that LR was saved at position 0 +- though, so treat any space below it as unprobed. */ +- if (final_adjustment_p +- && known_eq (frame.below_hard_fp_saved_regs_size, 0)) +- { +- poly_int64 lr_offset = (frame.reg_offset[LR_REGNUM] +- - frame.bytes_below_saved_regs); +- if (known_ge (lr_offset, 0)) +- min_probe_threshold -= lr_offset.to_constant (); +- else +- gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0)); +- } +- + poly_int64 frame_size = frame.frame_size; + + /* We should always have a positive probe threshold. */ +@@ -9541,8 +9529,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, + if (final_adjustment_p && rounded_size != 0) + min_probe_threshold = 0; + /* If doing a small final adjustment, we always probe at offset 0. +- This is done to avoid issues when LR is not at position 0 or when +- the final adjustment is smaller than the probing offset. */ ++ This is done to avoid issues when the final adjustment is smaller ++ than the probing offset. 
*/ + else if (final_adjustment_p && rounded_size == 0) + residual_probe_offset = 0; + +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c +new file mode 100644 +index 00000000000..82447d20fff +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c +@@ -0,0 +1,100 @@ ++/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++void f(int, ...); ++void g(); ++ ++/* ++** test1: ++** ... ++** str x30, \[sp\] ++** sub sp, sp, #4064 ++** str xzr, \[sp\] ++** cbnz w0, .* ++** bl g ++** ... ++** str x26, \[sp, #?4128\] ++** ... ++*/ ++int test1(int z) { ++ __uint128_t x = 0; ++ int y[0x400]; ++ if (z) ++ { ++ asm volatile ("" ::: ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"); ++ f(0, 0, 0, 0, 0, 0, 0, &y, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x); ++ } ++ g(); ++ return 1; ++} ++ ++/* ++** test2: ++** ... ++** str x30, \[sp\] ++** sub sp, sp, #1040 ++** str xzr, \[sp\] ++** cbnz w0, .* ++** bl g ++** ... 
++*/ ++int test2(int z) { ++ __uint128_t x = 0; ++ int y[0x400]; ++ if (z) ++ { ++ asm volatile ("" ::: ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"); ++ f(0, 0, 0, 0, 0, 0, 0, &y, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x); ++ } ++ g(); ++ return 1; ++} ++ ++/* ++** test3: ++** ... ++** str x30, \[sp\] ++** sub sp, sp, #1024 ++** cbnz w0, .* ++** bl g ++** ... ++*/ ++int test3(int z) { ++ __uint128_t x = 0; ++ int y[0x400]; ++ if (z) ++ { ++ asm volatile ("" ::: ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"); ++ f(0, 0, 0, 0, 0, 0, 0, &y, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); ++ } ++ g(); ++ return 1; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c +new file mode 100644 +index 00000000000..73ac3e4e4eb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c +@@ -0,0 +1,100 @@ ++/* { dg-options "-O2 -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12 -fsanitize=shadow-call-stack -ffixed-x18" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++void f(int, ...); ++void g(); ++ ++/* ++** test1: ++** ... ++** str x30, \[sp\] ++** sub sp, sp, #4064 ++** str xzr, \[sp\] ++** cbnz w0, .* ++** bl g ++** ... ++** str x26, \[sp, #?4128\] ++** ... 
++*/ ++int test1(int z) { ++ __uint128_t x = 0; ++ int y[0x400]; ++ if (z) ++ { ++ asm volatile ("" ::: ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"); ++ f(0, 0, 0, 0, 0, 0, 0, &y, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x); ++ } ++ g(); ++ return 1; ++} ++ ++/* ++** test2: ++** ... ++** str x30, \[sp\] ++** sub sp, sp, #1040 ++** str xzr, \[sp\] ++** cbnz w0, .* ++** bl g ++** ... ++*/ ++int test2(int z) { ++ __uint128_t x = 0; ++ int y[0x400]; ++ if (z) ++ { ++ asm volatile ("" ::: ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"); ++ f(0, 0, 0, 0, 0, 0, 0, &y, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x); ++ } ++ g(); ++ return 1; ++} ++ ++/* ++** test3: ++** ... ++** str x30, \[sp\] ++** sub sp, sp, #1024 ++** cbnz w0, .* ++** bl g ++** ... 
++*/ ++int test3(int z) { ++ __uint128_t x = 0; ++ int y[0x400]; ++ if (z) ++ { ++ asm volatile ("" ::: ++ "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"); ++ f(0, 0, 0, 0, 0, 0, 0, &y, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, ++ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); ++ } ++ g(); ++ return 1; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c +new file mode 100644 +index 00000000000..690aae8dfd5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-20.c +@@ -0,0 +1,3 @@ ++/* { dg-options "-O2 -fstack-protector-all -fstack-clash-protection -fomit-frame-pointer --param stack-clash-protection-guard-size=12 -fsanitize=shadow-call-stack -ffixed-x18" } */ ++ ++#include "stack-check-prologue-19.c" +-- +2.34.1 + + +From c4f0e121faa36342f1d21919e54a05ad841c4f86 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:55 +0100 +Subject: [PATCH 16/19] aarch64: Simplify probe of final frame allocation + +Previous patches ensured that the final frame allocation only needs +a probe when the size is strictly greater than 1KiB. It's therefore +safe to use the normal 1024 probe offset in all cases. + +The main motivation for doing this is to simplify the code and +reduce the number of special cases. + +gcc/ + * config/aarch64/aarch64.cc (aarch64_allocate_and_probe_stack_space): + Always probe the residual allocation at offset 1024, asserting + that that is in range. + +gcc/testsuite/ + * gcc.target/aarch64/stack-check-prologue-17.c: Expect the probe + to be at offset 1024 rather than offset 0. + * gcc.target/aarch64/stack-check-prologue-18.c: Likewise. + * gcc.target/aarch64/stack-check-prologue-19.c: Likewise. 
+--- + gcc/config/aarch64/aarch64.cc | 12 ++++-------- + .../gcc.target/aarch64/stack-check-prologue-17.c | 2 +- + .../gcc.target/aarch64/stack-check-prologue-18.c | 4 ++-- + .../gcc.target/aarch64/stack-check-prologue-19.c | 4 ++-- + 4 files changed, 9 insertions(+), 13 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 16433fb70f4..8abf3d7a1e2 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -9523,16 +9523,12 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, + are still safe. */ + if (residual) + { +- HOST_WIDE_INT residual_probe_offset = guard_used_by_caller; ++ gcc_assert (guard_used_by_caller + byte_sp_alignment <= size); ++ + /* If we're doing final adjustments, and we've done any full page + allocations then any residual needs to be probed. */ + if (final_adjustment_p && rounded_size != 0) + min_probe_threshold = 0; +- /* If doing a small final adjustment, we always probe at offset 0. +- This is done to avoid issues when the final adjustment is smaller +- than the probing offset. */ +- else if (final_adjustment_p && rounded_size == 0) +- residual_probe_offset = 0; + + aarch64_sub_sp (temp1, temp2, residual, frame_related_p); + if (residual >= min_probe_threshold) +@@ -9543,8 +9539,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, + HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required." 
+ "\n", residual); + +- emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, +- residual_probe_offset)); ++ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, ++ guard_used_by_caller)); + emit_insn (gen_blockage ()); + } + } +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c +index 0d8a25d73a2..f0ec1389771 100644 +--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-17.c +@@ -33,7 +33,7 @@ int test1(int z) { + ** ... + ** str x30, \[sp\] + ** sub sp, sp, #1040 +-** str xzr, \[sp\] ++** str xzr, \[sp, #?1024\] + ** cbnz w0, .* + ** bl g + ** ... +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c +index 82447d20fff..6383bec5ebc 100644 +--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-18.c +@@ -9,7 +9,7 @@ void g(); + ** ... + ** str x30, \[sp\] + ** sub sp, sp, #4064 +-** str xzr, \[sp\] ++** str xzr, \[sp, #?1024\] + ** cbnz w0, .* + ** bl g + ** ... +@@ -50,7 +50,7 @@ int test1(int z) { + ** ... + ** str x30, \[sp\] + ** sub sp, sp, #1040 +-** str xzr, \[sp\] ++** str xzr, \[sp, #?1024\] + ** cbnz w0, .* + ** bl g + ** ... +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c +index 73ac3e4e4eb..562039b5e9b 100644 +--- a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c ++++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-19.c +@@ -9,7 +9,7 @@ void g(); + ** ... + ** str x30, \[sp\] + ** sub sp, sp, #4064 +-** str xzr, \[sp\] ++** str xzr, \[sp, #?1024\] + ** cbnz w0, .* + ** bl g + ** ... +@@ -50,7 +50,7 @@ int test1(int z) { + ** ... 
+ ** str x30, \[sp\] + ** sub sp, sp, #1040 +-** str xzr, \[sp\] ++** str xzr, \[sp, #?1024\] + ** cbnz w0, .* + ** bl g + ** ... +-- +2.34.1 + + +From 6f0ab0a9f46a17b68349ff6035aa776bf65f0575 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:56 +0100 +Subject: [PATCH 17/19] aarch64: Explicitly record probe registers in frame + info + +The stack frame is currently divided into three areas: + +A: the area above the hard frame pointer +B: the SVE saves below the hard frame pointer +C: the outgoing arguments + +If the stack frame is allocated in one chunk, the allocation needs a +probe if the frame size is >= guard_size - 1KiB. In addition, if the +function is not a leaf function, it must probe an address no more than +1KiB above the outgoing SP. We ensured the second condition by + +(1) using single-chunk allocations for non-leaf functions only if + the link register save slot is within 512 bytes of the bottom + of the frame; and + +(2) using the link register save as a probe (meaning, for instance, + that it can't be individually shrink wrapped) + +If instead the stack is allocated in multiple chunks, then: + +* an allocation involving only the outgoing arguments (C above) requires + a probe if the allocation size is > 1KiB + +* any other allocation requires a probe if the allocation size + is >= guard_size - 1KiB + +* second and subsequent allocations require the previous allocation + to probe at the bottom of the allocated area, regardless of the size + of that previous allocation + +The final point means that, unlike for single allocations, +it can be necessary to have both a non-SVE register probe and +an SVE register probe. For example: + +* allocate A, probe using a non-SVE register save +* allocate B, probe using an SVE register save +* allocate C + +The non-SVE register used in this case was again the link register. 
+It was previously used even if the link register save slot was some +bytes above the bottom of the non-SVE register saves, but an earlier +patch avoided that by putting the link register save slot first. + +As a belt-and-braces fix, this patch explicitly records which +probe registers we're using and allows the non-SVE probe to be +whichever register comes first (as for SVE). + +The patch also avoids unnecessary probes in sve/pcs/stack_clash_3.c. + +gcc/ + * config/aarch64/aarch64.h (aarch64_frame::sve_save_and_probe) + (aarch64_frame::hard_fp_save_and_probe): New fields. + * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize them. + Rather than asserting that a leaf function saves LR, instead assert + that a leaf function saves something. + (aarch64_get_separate_components): Prevent the chosen probe + registers from being individually shrink-wrapped. + (aarch64_allocate_and_probe_stack_space): Remove workaround for + probe registers that aren't at the bottom of the previous allocation. + +gcc/testsuite/ + * gcc.target/aarch64/sve/pcs/stack_clash_3.c: Avoid redundant probes. +--- + gcc/config/aarch64/aarch64.cc | 68 +++++++++++++++---- + gcc/config/aarch64/aarch64.h | 8 +++ + .../aarch64/sve/pcs/stack_clash_3.c | 6 +- + 3 files changed, 64 insertions(+), 18 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 8abf3d7a1e2..a8d907df884 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8210,15 +8210,11 @@ aarch64_layout_frame (void) + && !crtl->abi->clobbers_full_reg_p (regno)) + frame.reg_offset[regno] = SLOT_REQUIRED; + +- /* With stack-clash, LR must be saved in non-leaf functions. The saving of +- LR counts as an implicit probe which allows us to maintain the invariant +- described in the comment at expand_prologue. 
*/ +- gcc_assert (crtl->is_leaf +- || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED)); + + poly_int64 offset = crtl->outgoing_args_size; + gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT)); + frame.bytes_below_saved_regs = offset; ++ frame.sve_save_and_probe = INVALID_REGNUM; + + /* Now assign stack slots for the registers. Start with the predicate + registers, since predicate LDR and STR have a relatively small +@@ -8226,6 +8222,8 @@ aarch64_layout_frame (void) + for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++) + if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) + { ++ if (frame.sve_save_and_probe == INVALID_REGNUM) ++ frame.sve_save_and_probe = regno; + frame.reg_offset[regno] = offset; + offset += BYTES_PER_SVE_PRED; + } +@@ -8263,6 +8261,8 @@ aarch64_layout_frame (void) + for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) + if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) + { ++ if (frame.sve_save_and_probe == INVALID_REGNUM) ++ frame.sve_save_and_probe = regno; + frame.reg_offset[regno] = offset; + offset += vector_save_size; + } +@@ -8272,10 +8272,18 @@ aarch64_layout_frame (void) + frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs; + bool saves_below_hard_fp_p + = maybe_ne (frame.below_hard_fp_saved_regs_size, 0); ++ gcc_assert (!saves_below_hard_fp_p ++ || (frame.sve_save_and_probe != INVALID_REGNUM ++ && known_eq (frame.reg_offset[frame.sve_save_and_probe], ++ frame.bytes_below_saved_regs))); ++ + frame.bytes_below_hard_fp = offset; ++ frame.hard_fp_save_and_probe = INVALID_REGNUM; + + auto allocate_gpr_slot = [&](unsigned int regno) + { ++ if (frame.hard_fp_save_and_probe == INVALID_REGNUM) ++ frame.hard_fp_save_and_probe = regno; + frame.reg_offset[regno] = offset; + if (frame.wb_push_candidate1 == INVALID_REGNUM) + frame.wb_push_candidate1 = regno; +@@ -8309,6 +8317,8 @@ aarch64_layout_frame (void) + for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) + if (known_eq 
(frame.reg_offset[regno], SLOT_REQUIRED)) + { ++ if (frame.hard_fp_save_and_probe == INVALID_REGNUM) ++ frame.hard_fp_save_and_probe = regno; + /* If there is an alignment gap between integer and fp callee-saves, + allocate the last fp register to it if possible. */ + if (regno == last_fp_reg +@@ -8332,6 +8342,17 @@ aarch64_layout_frame (void) + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); + + frame.saved_regs_size = offset - frame.bytes_below_saved_regs; ++ gcc_assert (known_eq (frame.saved_regs_size, ++ frame.below_hard_fp_saved_regs_size) ++ || (frame.hard_fp_save_and_probe != INVALID_REGNUM ++ && known_eq (frame.reg_offset[frame.hard_fp_save_and_probe], ++ frame.bytes_below_hard_fp))); ++ ++ /* With stack-clash, a register must be saved in non-leaf functions. ++ The saving of the bottommost register counts as an implicit probe, ++ which allows us to maintain the invariant described in the comment ++ at expand_prologue. */ ++ gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0)); + + offset += get_frame_size (); + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); +@@ -8462,6 +8483,25 @@ aarch64_layout_frame (void) + frame.final_adjust = frame.bytes_below_saved_regs; + } + ++ /* The frame is allocated in pieces, with each non-final piece ++ including a register save at offset 0 that acts as a probe for ++ the following piece. In addition, the save of the bottommost register ++ acts as a probe for callees and allocas. Roll back any probes that ++ aren't needed. ++ ++ A probe isn't needed if it is associated with the final allocation ++ (including callees and allocas) that happens before the epilogue is ++ executed. 
*/ ++ if (crtl->is_leaf ++ && !cfun->calls_alloca ++ && known_eq (frame.final_adjust, 0)) ++ { ++ if (maybe_ne (frame.sve_callee_adjust, 0)) ++ frame.sve_save_and_probe = INVALID_REGNUM; ++ else ++ frame.hard_fp_save_and_probe = INVALID_REGNUM; ++ } ++ + /* Make sure the individual adjustments add up to the full frame size. */ + gcc_assert (known_eq (frame.initial_adjust + + frame.callee_adjust +@@ -9039,13 +9079,6 @@ aarch64_get_separate_components (void) + + poly_int64 offset = frame.reg_offset[regno]; + +- /* If the register is saved in the first SVE save slot, we use +- it as a stack probe for -fstack-clash-protection. */ +- if (flag_stack_clash_protection +- && maybe_ne (frame.below_hard_fp_saved_regs_size, 0) +- && known_eq (offset, frame.bytes_below_saved_regs)) +- continue; +- + /* Get the offset relative to the register we'll use. */ + if (frame_pointer_needed) + offset -= frame.bytes_below_hard_fp; +@@ -9080,6 +9113,13 @@ aarch64_get_separate_components (void) + + bitmap_clear_bit (components, LR_REGNUM); + bitmap_clear_bit (components, SP_REGNUM); ++ if (flag_stack_clash_protection) ++ { ++ if (frame.sve_save_and_probe != INVALID_REGNUM) ++ bitmap_clear_bit (components, frame.sve_save_and_probe); ++ if (frame.hard_fp_save_and_probe != INVALID_REGNUM) ++ bitmap_clear_bit (components, frame.hard_fp_save_and_probe); ++ } + + return components; + } +@@ -9616,8 +9656,8 @@ aarch64_epilogue_uses (int regno) + When probing is needed, we emit a probe at the start of the prologue + and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter. + +- We have to track how much space has been allocated and the only stores +- to the stack we track as implicit probes are the FP/LR stores. ++ We can also use register saves as probes. These are stored in ++ sve_save_and_probe and hard_fp_save_and_probe. + + For outgoing arguments we probe if the size is larger than 1KB, such that + the ABI specified buffer is maintained for the next callee. 
+diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h +index b6135837073..46d4693e206 100644 +--- a/gcc/config/aarch64/aarch64.h ++++ b/gcc/config/aarch64/aarch64.h +@@ -957,6 +957,14 @@ struct GTY (()) aarch64_frame + This is the register they should use. */ + unsigned spare_pred_reg; + ++ /* An SVE register that is saved below the hard frame pointer and that acts ++ as a probe for later allocations, or INVALID_REGNUM if none. */ ++ unsigned sve_save_and_probe; ++ ++ /* A register that is saved at the hard frame pointer and that acts ++ as a probe for later allocations, or INVALID_REGNUM if none. */ ++ unsigned hard_fp_save_and_probe; ++ + bool laid_out; + + /* True if shadow call stack should be enabled for the current function. */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c +index 3e01ec36c3a..3530a0d504b 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c +@@ -11,11 +11,10 @@ + ** mov x11, sp + ** ... + ** sub sp, sp, x13 +-** str p4, \[sp\] + ** cbz w0, [^\n]* ++** str p4, \[sp\] + ** ... + ** ptrue p0\.b, all +-** ldr p4, \[sp\] + ** addvl sp, sp, #1 + ** ldr x24, \[sp\], 32 + ** ret +@@ -39,13 +38,12 @@ test_1 (int n) + ** mov x11, sp + ** ... + ** sub sp, sp, x13 +-** str p4, \[sp\] + ** cbz w0, [^\n]* ++** str p4, \[sp\] + ** str p5, \[sp, #1, mul vl\] + ** str p6, \[sp, #2, mul vl\] + ** ... + ** ptrue p0\.b, all +-** ldr p4, \[sp\] + ** addvl sp, sp, #1 + ** ldr x24, \[sp\], 32 + ** ret +-- +2.34.1 + + +From 8254e1b9cd500e0c278465a3657543477e9d1250 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:56 +0100 +Subject: [PATCH 18/19] aarch64: Remove below_hard_fp_saved_regs_size + +After previous patches, it's no longer necessary to store +saved_regs_size and below_hard_fp_saved_regs_size in the frame info. 
+All measurements instead use the top or bottom of the frame as +reference points. + +gcc/ + * config/aarch64/aarch64.h (aarch64_frame::saved_regs_size) + (aarch64_frame::below_hard_fp_saved_regs_size): Delete. + * config/aarch64/aarch64.cc (aarch64_layout_frame): Update accordingly. +--- + gcc/config/aarch64/aarch64.cc | 45 ++++++++++++++++------------------- + gcc/config/aarch64/aarch64.h | 7 ------ + 2 files changed, 21 insertions(+), 31 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index a8d907df884..ac3d3b336a3 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8269,9 +8269,8 @@ aarch64_layout_frame (void) + + /* OFFSET is now the offset of the hard frame pointer from the bottom + of the callee save area. */ +- frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs; +- bool saves_below_hard_fp_p +- = maybe_ne (frame.below_hard_fp_saved_regs_size, 0); ++ auto below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs; ++ bool saves_below_hard_fp_p = maybe_ne (below_hard_fp_saved_regs_size, 0); + gcc_assert (!saves_below_hard_fp_p + || (frame.sve_save_and_probe != INVALID_REGNUM + && known_eq (frame.reg_offset[frame.sve_save_and_probe], +@@ -8341,9 +8340,8 @@ aarch64_layout_frame (void) + + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); + +- frame.saved_regs_size = offset - frame.bytes_below_saved_regs; +- gcc_assert (known_eq (frame.saved_regs_size, +- frame.below_hard_fp_saved_regs_size) ++ auto saved_regs_size = offset - frame.bytes_below_saved_regs; ++ gcc_assert (known_eq (saved_regs_size, below_hard_fp_saved_regs_size) + || (frame.hard_fp_save_and_probe != INVALID_REGNUM + && known_eq (frame.reg_offset[frame.hard_fp_save_and_probe], + frame.bytes_below_hard_fp))); +@@ -8352,7 +8350,7 @@ aarch64_layout_frame (void) + The saving of the bottommost register counts as an implicit probe, + which allows us to maintain the 
invariant described in the comment + at expand_prologue. */ +- gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0)); ++ gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0)); + + offset += get_frame_size (); + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); +@@ -8409,7 +8407,7 @@ aarch64_layout_frame (void) + + HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp; + HOST_WIDE_INT const_saved_regs_size; +- if (known_eq (frame.saved_regs_size, 0)) ++ if (known_eq (saved_regs_size, 0)) + frame.initial_adjust = frame.frame_size; + else if (frame.frame_size.is_constant (&const_size) + && const_size < max_push_offset +@@ -8422,7 +8420,7 @@ aarch64_layout_frame (void) + frame.callee_adjust = const_size; + } + else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs) +- && frame.saved_regs_size.is_constant (&const_saved_regs_size) ++ && saved_regs_size.is_constant (&const_saved_regs_size) + && const_below_saved_regs + const_saved_regs_size < 512 + /* We could handle this case even with data below the saved + registers, provided that that data left us with valid offsets +@@ -8441,8 +8439,7 @@ aarch64_layout_frame (void) + frame.initial_adjust = frame.frame_size; + } + else if (saves_below_hard_fp_p +- && known_eq (frame.saved_regs_size, +- frame.below_hard_fp_saved_regs_size)) ++ && known_eq (saved_regs_size, below_hard_fp_saved_regs_size)) + { + /* Frame in which all saves are SVE saves: + +@@ -8464,7 +8461,7 @@ aarch64_layout_frame (void) + [save SVE registers relative to SP] + sub sp, sp, bytes_below_saved_regs */ + frame.callee_adjust = const_above_fp; +- frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; ++ frame.sve_callee_adjust = below_hard_fp_saved_regs_size; + frame.final_adjust = frame.bytes_below_saved_regs; + } + else +@@ -8479,7 +8476,7 @@ aarch64_layout_frame (void) + [save SVE registers relative to SP] + sub sp, sp, bytes_below_saved_regs */ + frame.initial_adjust = 
frame.bytes_above_hard_fp; +- frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; ++ frame.sve_callee_adjust = below_hard_fp_saved_regs_size; + frame.final_adjust = frame.bytes_below_saved_regs; + } + +@@ -9621,17 +9618,17 @@ aarch64_epilogue_uses (int regno) + | local variables | <-- frame_pointer_rtx + | | + +-------------------------------+ +- | padding | \ +- +-------------------------------+ | +- | callee-saved registers | | frame.saved_regs_size +- +-------------------------------+ | +- | LR' | | +- +-------------------------------+ | +- | FP' | | +- +-------------------------------+ |<- hard_frame_pointer_rtx (aligned) +- | SVE vector registers | | \ +- +-------------------------------+ | | below_hard_fp_saved_regs_size +- | SVE predicate registers | / / ++ | padding | ++ +-------------------------------+ ++ | callee-saved registers | ++ +-------------------------------+ ++ | LR' | ++ +-------------------------------+ ++ | FP' | ++ +-------------------------------+ <-- hard_frame_pointer_rtx (aligned) ++ | SVE vector registers | ++ +-------------------------------+ ++ | SVE predicate registers | + +-------------------------------+ + | dynamic allocation | + +-------------------------------+ +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h +index 46d4693e206..01f7751bc78 100644 +--- a/gcc/config/aarch64/aarch64.h ++++ b/gcc/config/aarch64/aarch64.h +@@ -871,18 +871,11 @@ struct GTY (()) aarch64_frame + STACK_BOUNDARY. */ + HOST_WIDE_INT saved_varargs_size; + +- /* The size of the callee-save registers with a slot in REG_OFFSET. */ +- poly_int64 saved_regs_size; +- + /* The number of bytes between the bottom of the static frame (the bottom + of the outgoing arguments) and the bottom of the register save area. + This value is always a multiple of STACK_BOUNDARY. */ + poly_int64 bytes_below_saved_regs; + +- /* The size of the callee-save registers with a slot in REG_OFFSET that +- are saved below the hard frame pointer. 
*/ +- poly_int64 below_hard_fp_saved_regs_size; +- + /* The number of bytes between the bottom of the static frame (the bottom + of the outgoing arguments) and the hard frame pointer. This value is + always a multiple of STACK_BOUNDARY. */ +-- +2.34.1 + + +From 75c37e031408262263442f5b4cdb83d3777b6422 Mon Sep 17 00:00:00 2001 +From: Richard Sandiford <richard.sandiford@arm.com> +Date: Tue, 12 Sep 2023 16:08:57 +0100 +Subject: [PATCH 19/19] aarch64: Make stack smash canary protect saved + registers + +AArch64 normally puts the saved registers near the bottom of the frame, +immediately above any dynamic allocations. But this means that a +stack-smash attack on those dynamic allocations could overwrite the +saved registers without needing to reach as far as the stack smash +canary. + +The same thing could also happen for variable-sized arguments that are +passed by value, since those are allocated before a call and popped on +return. + +This patch avoids that by putting the locals (and thus the canary) below +the saved registers when stack smash protection is active. + +The patch fixes CVE-2023-4039. + +gcc/ + * config/aarch64/aarch64.cc (aarch64_save_regs_above_locals_p): + New function. + (aarch64_layout_frame): Use it to decide whether locals should + go above or below the saved registers. + (aarch64_expand_prologue): Update stack layout comment. + Emit a stack tie after the final adjustment. + +gcc/testsuite/ + * gcc.target/aarch64/stack-protector-8.c: New test. + * gcc.target/aarch64/stack-protector-9.c: Likewise. 
+--- + gcc/config/aarch64/aarch64.cc | 46 +++++++-- + .../gcc.target/aarch64/stack-protector-8.c | 95 +++++++++++++++++++ + .../gcc.target/aarch64/stack-protector-9.c | 33 +++++++ + 3 files changed, 168 insertions(+), 6 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-8.c + create mode 100644 gcc/testsuite/gcc.target/aarch64/stack-protector-9.c + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index ac3d3b336a3..96c3f48fdc4 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -8133,6 +8133,20 @@ aarch64_needs_frame_chain (void) + return aarch64_use_frame_pointer; + } + ++/* Return true if the current function should save registers above ++ the locals area, rather than below it. */ ++ ++static bool ++aarch64_save_regs_above_locals_p () ++{ ++ /* When using stack smash protection, make sure that the canary slot ++ comes between the locals and the saved registers. Otherwise, ++ it would be possible for a carefully sized smash attack to change ++ the saved registers (particularly LR and FP) without reaching the ++ canary. */ ++ return crtl->stack_protect_guard; ++} ++ + /* Mark the registers that need to be saved by the callee and calculate + the size of the callee-saved registers area and frame record (both FP + and LR may be omitted). 
*/ +@@ -8144,6 +8158,7 @@ aarch64_layout_frame (void) + poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode); + bool frame_related_fp_reg_p = false; + aarch64_frame &frame = cfun->machine->frame; ++ poly_int64 top_of_locals = -1; + + frame.emit_frame_chain = aarch64_needs_frame_chain (); + +@@ -8210,9 +8225,16 @@ aarch64_layout_frame (void) + && !crtl->abi->clobbers_full_reg_p (regno)) + frame.reg_offset[regno] = SLOT_REQUIRED; + ++ bool regs_at_top_p = aarch64_save_regs_above_locals_p (); + + poly_int64 offset = crtl->outgoing_args_size; + gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT)); ++ if (regs_at_top_p) ++ { ++ offset += get_frame_size (); ++ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); ++ top_of_locals = offset; ++ } + frame.bytes_below_saved_regs = offset; + frame.sve_save_and_probe = INVALID_REGNUM; + +@@ -8352,15 +8374,18 @@ aarch64_layout_frame (void) + at expand_prologue. */ + gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0)); + +- offset += get_frame_size (); +- offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); +- auto top_of_locals = offset; +- ++ if (!regs_at_top_p) ++ { ++ offset += get_frame_size (); ++ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); ++ top_of_locals = offset; ++ } + offset += frame.saved_varargs_size; + gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT)); + frame.frame_size = offset; + + frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp; ++ gcc_assert (known_ge (top_of_locals, 0)); + frame.bytes_above_locals = frame.frame_size - top_of_locals; + + frame.initial_adjust = 0; +@@ -9615,10 +9640,10 @@ aarch64_epilogue_uses (int regno) + | for register varargs | + | | + +-------------------------------+ +- | local variables | <-- frame_pointer_rtx ++ | local variables (1) | <-- frame_pointer_rtx + | | + +-------------------------------+ +- | padding | ++ | padding (1) | + 
+-------------------------------+ + | callee-saved registers | + +-------------------------------+ +@@ -9630,6 +9655,10 @@ aarch64_epilogue_uses (int regno) + +-------------------------------+ + | SVE predicate registers | + +-------------------------------+ ++ | local variables (2) | ++ +-------------------------------+ ++ | padding (2) | ++ +-------------------------------+ + | dynamic allocation | + +-------------------------------+ + | padding | +@@ -9639,6 +9668,9 @@ aarch64_epilogue_uses (int regno) + +-------------------------------+ + | | <-- stack_pointer_rtx (aligned) + ++ The regions marked (1) and (2) are mutually exclusive. (2) is used ++ when aarch64_save_regs_above_locals_p is true. ++ + Dynamic stack allocations via alloca() decrease stack_pointer_rtx + but leave frame_pointer_rtx and hard_frame_pointer_rtx + unchanged. +@@ -9834,6 +9866,8 @@ aarch64_expand_prologue (void) + gcc_assert (known_eq (bytes_below_sp, final_adjust)); + aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust, + !frame_pointer_needed, true); ++ if (emit_frame_chain && maybe_ne (final_adjust, 0)) ++ emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx)); + } + + /* Return TRUE if we can use a simple_return insn. 
+diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c +new file mode 100644 +index 00000000000..e71d820e365 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-8.c +@@ -0,0 +1,95 @@ ++/* { dg-options " -O -fstack-protector-strong -mstack-protector-guard=sysreg -mstack-protector-guard-reg=tpidr2_el0 -mstack-protector-guard-offset=16" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++void g(void *); ++__SVBool_t *h(void *); ++ ++/* ++** test1: ++** sub sp, sp, #288 ++** stp x29, x30, \[sp, #?272\] ++** add x29, sp, #?272 ++** mrs (x[0-9]+), tpidr2_el0 ++** ldr (x[0-9]+), \[\1, #?16\] ++** str \2, \[sp, #?264\] ++** mov \2, #?0 ++** add x0, sp, #?8 ++** bl g ++** ... ++** mrs .* ++** ... ++** bne .* ++** ... ++** ldp x29, x30, \[sp, #?272\] ++** add sp, sp, #?288 ++** ret ++** bl __stack_chk_fail ++*/ ++int test1() { ++ int y[0x40]; ++ g(y); ++ return 1; ++} ++ ++/* ++** test2: ++** stp x29, x30, \[sp, #?-16\]! ++** mov x29, sp ++** sub sp, sp, #1040 ++** mrs (x[0-9]+), tpidr2_el0 ++** ldr (x[0-9]+), \[\1, #?16\] ++** str \2, \[sp, #?1032\] ++** mov \2, #?0 ++** add x0, sp, #?8 ++** bl g ++** ... ++** mrs .* ++** ... ++** bne .* ++** ... ++** add sp, sp, #?1040 ++** ldp x29, x30, \[sp\], #?16 ++** ret ++** bl __stack_chk_fail ++*/ ++int test2() { ++ int y[0x100]; ++ g(y); ++ return 1; ++} ++ ++#pragma GCC target "+sve" ++ ++/* ++** test3: ++** stp x29, x30, \[sp, #?-16\]! ++** mov x29, sp ++** addvl sp, sp, #-18 ++** ... ++** str p4, \[sp\] ++** ... ++** sub sp, sp, #272 ++** mrs (x[0-9]+), tpidr2_el0 ++** ldr (x[0-9]+), \[\1, #?16\] ++** str \2, \[sp, #?264\] ++** mov \2, #?0 ++** add x0, sp, #?8 ++** bl h ++** ... ++** mrs .* ++** ... ++** bne .* ++** ... ++** add sp, sp, #?272 ++** ... ++** ldr p4, \[sp\] ++** ... 
++** addvl sp, sp, #18 ++** ldp x29, x30, \[sp\], #?16 ++** ret ++** bl __stack_chk_fail ++*/ ++__SVBool_t test3() { ++ int y[0x40]; ++ return *h(y); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c +new file mode 100644 +index 00000000000..58f322aa480 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/stack-protector-9.c +@@ -0,0 +1,33 @@ ++/* { dg-options "-O2 -mcpu=neoverse-v1 -fstack-protector-all" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++/* ++** main: ++** ... ++** stp x29, x30, \[sp, #?-[0-9]+\]! ++** ... ++** sub sp, sp, #[0-9]+ ++** ... ++** str x[0-9]+, \[x29, #?-8\] ++** ... ++*/ ++int f(const char *); ++void g(void *); ++int main(int argc, char* argv[]) ++{ ++ int a; ++ int b; ++ char c[2+f(argv[1])]; ++ int d[0x100]; ++ char y; ++ ++ y=42; a=4; b=10; ++ c[0] = 'h'; c[1] = '\0'; ++ ++ c[f(argv[2])] = '\0'; ++ ++ __builtin_printf("%d %d\n%s\n", a, b, c); ++ g(d); ++ ++ return 0; ++} +-- +2.34.1 +