diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs
index 6d78c2af8256..cbb1bff9cd13 100644
--- a/cranelift/codegen/src/isa/aarch64/abi.rs
+++ b/cranelift/codegen/src/isa/aarch64/abi.rs
@@ -596,15 +596,27 @@ impl ABIMachineSpec for AArch64MachineDeps {
         }
 
         if setup_frame {
-            // stp fp (x29), lr (x30), [sp, #-16]!
-            insts.push(Inst::StoreP64 {
-                rt: fp_reg(),
-                rt2: link_reg(),
-                mem: PairAMode::SPPreIndexed {
-                    simm7: SImm7Scaled::maybe_from_i64(-16, types::I64).unwrap(),
-                },
-                flags: MemFlags::trusted(),
-            });
+            let setup_area_size = frame_layout.setup_area_size as i64;
+            if setup_area_size == 8 {
+                // str fp, [sp, #-8]!
+                insts.push(Inst::Store64 {
+                    rd: fp_reg(),
+                    mem: AMode::SPPreIndexed {
+                        simm9: SImm9::maybe_from_i64(-setup_area_size).unwrap(),
+                    },
+                    flags: MemFlags::trusted(),
+                });
+            } else {
+                // stp fp (x29), lr (x30), [sp, #-16]!
+                insts.push(Inst::StoreP64 {
+                    rt: fp_reg(),
+                    rt2: link_reg(),
+                    mem: PairAMode::SPPreIndexed {
+                        simm7: SImm7Scaled::maybe_from_i64(-16, types::I64).unwrap(),
+                    },
+                    flags: MemFlags::trusted(),
+                });
+            }
 
             if flags.unwind_info() {
                 insts.push(Inst::Unwind {
@@ -645,15 +657,27 @@ impl ABIMachineSpec for AArch64MachineDeps {
             // clobber-restore code (which also frees the fixed frame). Hence, there
             // is no need for the usual `mov sp, fp` here.
 
-            // `ldp fp, lr, [sp], #16`
-            insts.push(Inst::LoadP64 {
-                rt: writable_fp_reg(),
-                rt2: writable_link_reg(),
-                mem: PairAMode::SPPostIndexed {
-                    simm7: SImm7Scaled::maybe_from_i64(16, types::I64).unwrap(),
-                },
-                flags: MemFlags::trusted(),
-            });
+            let setup_area_size = frame_layout.setup_area_size as i64;
+            if setup_area_size == 8 {
+                // `ldr fp, [sp], #8`
+                insts.push(Inst::ULoad64 {
+                    rd: writable_fp_reg(),
+                    mem: AMode::SPPostIndexed {
+                        simm9: SImm9::maybe_from_i64(setup_area_size).unwrap(),
+                    },
+                    flags: MemFlags::trusted(),
+                });
+            } else {
+                // `ldp fp, lr, [sp], #16`
+                insts.push(Inst::LoadP64 {
+                    rt: writable_fp_reg(),
+                    rt2: writable_link_reg(),
+                    mem: PairAMode::SPPostIndexed {
+                        simm7: SImm7Scaled::maybe_from_i64(setup_area_size, types::I64).unwrap(),
+                    },
+                    flags: MemFlags::trusted(),
+                });
+            }
         }
 
         if call_conv == isa::CallConv::Tail && frame_layout.tail_args_size > 0 {
@@ -1144,16 +1168,20 @@ impl ABIMachineSpec for AArch64MachineDeps {
 
         // Compute linkage frame size.
         let setup_area_size = if flags.preserve_frame_pointers()
-            || function_calls != FunctionCalls::None
             // The function arguments that are passed on the stack are addressed
             // relative to the Frame Pointer.
+            || flags.unwind_info()
             || incoming_args_size > 0
             || clobber_size > 0
             || fixed_frame_storage_size > 0
         {
             16 // FP, LR
         } else {
-            0
+            match function_calls {
+                FunctionCalls::Regular => 16,
+                FunctionCalls::None => 0,
+                FunctionCalls::TailOnly => 8,
+            }
         };
 
         // Return FrameLayout structure.
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 13ffccb38f4a..d29b014d3cb5 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -3635,19 +3635,32 @@ fn emit_return_call_common_sequence(
         // clobber-restore code (which also frees the fixed frame). Hence, there
         // is no need for the usual `mov sp, fp` here.
 
-        // `ldp fp, lr, [sp], #16`
-        Inst::LoadP64 {
-            rt: writable_fp_reg(),
-            rt2: writable_link_reg(),
-            mem: PairAMode::SPPostIndexed {
-                // TODO: we could fold the increment for incoming_args_diff here, as long as that
-                // value is less than 502*8, by adding it to `setup_area_size`.
-                // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDP--Load-Pair-of-Registers-
-                simm7: SImm7Scaled::maybe_from_i64(i64::from(setup_area_size), types::I64).unwrap(),
-            },
-            flags: MemFlags::trusted(),
+        if setup_area_size == 8 {
+            // `ldr fp, [sp], #8`
+            Inst::ULoad64 {
+                rd: writable_fp_reg(),
+                mem: AMode::SPPostIndexed {
+                    simm9: SImm9::maybe_from_i64(i64::from(setup_area_size)).unwrap(),
+                },
+                flags: MemFlags::trusted(),
+            }
+            .emit(sink, emit_info, state);
+        } else {
+            // `ldp fp, lr, [sp], #16`
+            Inst::LoadP64 {
+                rt: writable_fp_reg(),
+                rt2: writable_link_reg(),
+                mem: PairAMode::SPPostIndexed {
+                    // TODO: we could fold the increment for incoming_args_diff here, as long as that
+                    // value is less than 502*8, by adding it to `setup_area_size`.
+                    // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDP--Load-Pair-of-Registers-
+                    simm7: SImm7Scaled::maybe_from_i64(i64::from(setup_area_size), types::I64)
+                        .unwrap(),
+                },
+                flags: MemFlags::trusted(),
+            }
+            .emit(sink, emit_info, state);
         }
-        .emit(sink, emit_info, state);
     }
 
     // Adjust SP to account for the possible over-allocation in the prologue.
diff --git a/cranelift/filetests/filetests/isa/aarch64/tail-call-frame-optimization.clif b/cranelift/filetests/filetests/isa/aarch64/tail-call-frame-optimization.clif
new file mode 100644
index 000000000000..0c6877d625ac
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/aarch64/tail-call-frame-optimization.clif
@@ -0,0 +1,130 @@
+test compile precise-output
+set unwind_info=false
+set preserve_frame_pointers=false
+target aarch64
+
+;; Test 1: Tail calling convention with tail-only calls
+;; FunctionCalls::TailOnly → setup_area_size = 8 → optimized frame
+function %tail_only_function() -> i64 tail {
+    fn0 = colocated %target_func() -> i64 tail
+
+block0:
+    return_call fn0()
+}
+
+; VCode:
+;   str fp, [sp, #-8]!
+;   mov fp, sp
+; block0:
+;   return_call TestCase(%target_func) new_stack_arg_size:0
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   str x29, [sp, #-8]!
+;   mov x29, sp
+; block1: ; offset 0x8
+;   ldr x29, [sp], #8
+;   b #0xc ; reloc_external Call %target_func 0
+
+;; Test 2: SystemV calling convention with regular calls
+;; FunctionCalls::Regular → setup_area_size = 16 → standard frame
+function %systemv_regular_function() -> i64 system_v {
+    fn0 = colocated %target_func() -> i64 system_v
+
+block0:
+    v0 = call fn0()
+    return v0
+}
+
+; VCode:
+;   stp fp, lr, [sp, #-16]!
+;   mov fp, sp
+; block0:
+;   bl 0
+;   ldp fp, lr, [sp], #16
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   stp x29, x30, [sp, #-0x10]!
+;   mov x29, sp
+; block1: ; offset 0x8
+;   bl #8 ; reloc_external Call %target_func 0
+;   ldp x29, x30, [sp], #0x10
+;   ret
+
+;; Test 3: Tail calling convention with conditional tail calls
+;; Multiple return_call instructions still → FunctionCalls::TailOnly → optimized frame
+function %tail_only_conditional(i64) -> i64 tail {
+    fn0 = colocated %target_func() -> i64 tail
+    fn1 = colocated %other_func() -> i64 tail
+
+block0(v0: i64):
+    v1 = iconst.i64 0
+    v2 = icmp sgt v0, v1
+    brif v2, block1, block2
+
+block1:
+    return_call fn0()
+
+block2:
+    return_call fn1()
+}
+
+; VCode:
+;   str fp, [sp, #-8]!
+;   mov fp, sp
+; block0:
+;   subs xzr, x2, #0
+;   b.gt label2 ; b label1
+; block1:
+;   return_call TestCase(%other_func) new_stack_arg_size:0
+; block2:
+;   return_call TestCase(%target_func) new_stack_arg_size:0
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   str x29, [sp, #-8]!
+;   mov x29, sp
+; block1: ; offset 0x8
+;   cmp x2, #0
+;   b.gt #0x18
+; block2: ; offset 0x10
+;   ldr x29, [sp], #8
+;   b #0x14 ; reloc_external Call %other_func 0
+; block3: ; offset 0x18
+;   ldr x29, [sp], #8
+;   b #0x1c ; reloc_external Call %target_func 0
+
+;; Target functions for testing
+function %target_func() -> i64 {
+block0:
+    v0 = iconst.i64 42
+    return v0
+}
+
+; VCode:
+; block0:
+;   movz x0, #42
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   mov x0, #0x2a
+;   ret
+
+function %other_func() -> i64 {
+block0:
+    v0 = iconst.i64 24
+    return v0
+}
+
+; VCode:
+; block0:
+;   movz x0, #24
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   mov x0, #0x18
+;   ret
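
The sketch below is not part of the patch; it is a minimal, self-contained restatement of the setup-area-size rule that the abi.rs hunk adds to `compute_frame_layout`, handy for cross-checking the three cases exercised by the filetest. The `FunctionCalls` enum and the free function `setup_area_size` here are illustrative stand-ins, not the actual Cranelift items.

// Illustrative sketch only: a standalone model of the frame-size decision above.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FunctionCalls {
    Regular,  // at least one ordinary (non-tail) call: FP and LR must be saved
    TailOnly, // only tail calls: a tail call branches without link, so LR is never clobbered
    None,     // leaf function: no linkage area at all
}

// Mirrors the rule in the abi.rs hunk: a full 16-byte FP/LR pair is kept whenever
// the frame pointer, unwind info, incoming stack arguments, clobbers, or fixed
// frame storage force a frame; otherwise the size depends only on the kinds of calls.
fn setup_area_size(
    preserve_frame_pointers: bool,
    unwind_info: bool,
    incoming_args_size: u32,
    clobber_size: u32,
    fixed_frame_storage_size: u32,
    function_calls: FunctionCalls,
) -> u32 {
    if preserve_frame_pointers
        || unwind_info
        || incoming_args_size > 0
        || clobber_size > 0
        || fixed_frame_storage_size > 0
    {
        16 // FP, LR
    } else {
        match function_calls {
            FunctionCalls::Regular => 16, // save FP and LR
            FunctionCalls::None => 0,     // no frame setup needed
            FunctionCalls::TailOnly => 8, // save FP only
        }
    }
}

fn main() {
    // With the flags used by the filetest (unwind_info=false, preserve_frame_pointers=false),
    // the three test functions map to 8, 16, and 0 bytes respectively.
    assert_eq!(setup_area_size(false, false, 0, 0, 0, FunctionCalls::TailOnly), 8);
    assert_eq!(setup_area_size(false, false, 0, 0, 0, FunctionCalls::Regular), 16);
    assert_eq!(setup_area_size(false, false, 0, 0, 0, FunctionCalls::None), 0);
}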