Commit 466101e

[AArch64][SME] Support Windows/stack probes in MachineSMEABIPass
On Windows, or with stack probes on other targets, additional code needs to be inserted after dynamic stack allocations to validate stack accesses and/or ensure enough stack space has been allocated. Rather than handle this case in the MachineSMEABIPass (as we do for the standard case), we allocate the memory for the lazy save buffer in SelectionDAG, which allows the existing expansions to emit the correct code.

Note: This means that in these cases we may allocate a lazy save buffer even when there are no lazy saves present in the function (as we have to allocate the buffer before the MachineSMEABIPass runs).

Change-Id: If89ab54c4de79f6fe5513a6b387e9e349f7bc7d1
1 parent f82054e commit 466101e
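For context, a minimal IR input that exercises the new path, lifted from the test_lazy_save test added below (its RUN lines use llc -mtriple=aarch64-windows-msvc -mattr=+sve,+sme -aarch64-new-sme-abi): a ZA-state function calling a private-ZA callee needs a lazy save, so on Windows the SVL * SVL byte buffer is now allocated up front in SelectionDAG, where the DYNAMIC_STACKALLOC expansion emits the __chkstk probing sequence.

; Minimal sketch, mirroring the new Windows test added in this commit.
declare void @private_za_callee()

define void @test_lazy_save() nounwind "aarch64_inout_za" {
  ; The call to a private-ZA callee requires a lazy ZA save; with
  ; -aarch64-new-sme-abi on aarch64-windows-msvc the save buffer is
  ; allocated via DYNAMIC_STACKALLOC during ISel and probed with __chkstk.
  call void @private_za_callee()
  ret void
}

As the commit message notes, this allocation happens before MachineSMEABIPass runs, so a ZA-state function on these targets may receive the buffer even when the pass later finds no lazy save in it.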

7 files changed: +160, -14 lines

llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp

Lines changed: 1 addition & 0 deletions
@@ -1688,6 +1688,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
   }
   case AArch64::InOutZAUsePseudo:
   case AArch64::RequiresZASavePseudo:
+  case AArch64::SMEStateAllocPseudo:
   case AArch64::COALESCER_BARRIER_FPR16:
   case AArch64::COALESCER_BARRIER_FPR32:
   case AArch64::COALESCER_BARRIER_FPR64:

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 33 additions & 1 deletion
@@ -8290,7 +8290,39 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   if (Subtarget->hasCustomCallingConv())
     Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
 
-  if (!getTM().useNewSMEABILowering() || Attrs.hasAgnosticZAInterface()) {
+  if (getTM().useNewSMEABILowering() && !Attrs.hasAgnosticZAInterface()) {
+    if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
+      SDValue Size;
+      if (Attrs.hasZAState()) {
+        SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
+                                  DAG.getConstant(1, DL, MVT::i32));
+        Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
+      } else if (Attrs.hasAgnosticZAInterface()) {
+        RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
+        SDValue Callee = DAG.getExternalSymbol(
+            getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
+        auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
+        TargetLowering::CallLoweringInfo CLI(DAG);
+        CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
+            getLibcallCallingConv(LC), RetTy, Callee, {});
+        std::tie(Size, Chain) = LowerCallTo(CLI);
+      }
+      if (Size) {
+        SDValue Buffer = DAG.getNode(
+            ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
+            {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
+        Chain = Buffer.getValue(1);
+
+        Register BufferPtr =
+            MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+        Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
+        Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
+                            DAG.getVTList(MVT::Other), Chain);
+        FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
+        MFI.CreateVariableSizedObject(Align(16), nullptr);
+      }
+    }
+  } else {
     // Old SME ABI lowering (deprecated):
     // Create a 16 Byte TPIDR2 object. The dynamic buffer
     // will be expanded and stored in the static object later using a

llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h

Lines changed: 10 additions & 0 deletions
@@ -242,6 +242,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   // Holds the SME function attributes (streaming mode, ZA/ZT0 state).
   SMEAttrs SMEFnAttrs;
 
+  // Holds the TPIDR2 block if allocated early (for Windows/stack probes
+  // support).
+  Register EarlyAllocSMESaveBuffer = AArch64::NoRegister;
+
   // Note: The following properties are only used for the old SME ABI lowering:
   /// The frame-index for the TPIDR2 object used for lazy saves.
   TPIDR2Object TPIDR2;
@@ -260,6 +264,12 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
       const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
       const override;
 
+  void setEarlyAllocSMESaveBuffer(Register Ptr) {
+    EarlyAllocSMESaveBuffer = Ptr;
+  }
+
+  Register getEarlyAllocSMESaveBuffer() { return EarlyAllocSMESaveBuffer; }
+
   // Old SME ABI lowering state getters/setters:
   Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; };
   void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; };

llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td

Lines changed: 7 additions & 0 deletions
@@ -99,6 +99,8 @@ let hasSideEffects = 1, isMeta = 1 in {
   def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
 }
 
+def SMEStateAllocPseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
+
 def CommitZASavePseudo
     : Pseudo<(outs),
         (ins GPR64:$tpidr2_el0, i1imm:$zero_za, i64imm:$commit_routine, variable_ops), []>,
@@ -114,6 +116,11 @@ def AArch64_requires_za_save
                            [SDNPHasChain, SDNPInGlue]>;
 def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>;
 
+def AArch64_sme_state_alloc
+    : SDNode<"AArch64ISD::SME_STATE_ALLOC", SDTypeProfile<0, 0, []>,
+             [SDNPHasChain]>;
+def : Pat<(AArch64_sme_state_alloc), (SMEStateAllocPseudo)>;
+
 //===----------------------------------------------------------------------===//
 // Instruction naming conventions.
 //===----------------------------------------------------------------------===//

llvm/lib/Target/AArch64/MachineSMEABIPass.cpp

Lines changed: 21 additions & 4 deletions
@@ -249,6 +249,7 @@ struct MachineSMEABI : public MachineFunctionPass {
     SmallVector<BlockInfo> Blocks;
     SmallVector<ZAState> BundleStates;
     std::optional<TPIDR2State> TPIDR2Block;
+    std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
   } State;
 
   MachineFunction *MF = nullptr;
@@ -298,6 +299,13 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
       MachineBasicBlock::iterator MBBI(MI);
       LiveUnits.stepBackward(MI);
      LiveRegs PhysLiveRegs = GetPhysLiveRegs();
+      // The SMEStateAllocPseudo marker is added to a function if the save
+      // buffer was allocated in SelectionDAG. It marks the end of the
+      // allocation -- which is a safe point for this pass to insert any TPIDR2
+      // block setup.
+      if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) {
+        State.AfterSMEProloguePt = MBBI;
+      }
       auto [NeededState, InsertPt] = getZAStateBeforeInst(
           *TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface());
       assert((InsertPt == MBBI ||
@@ -529,23 +537,25 @@ void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB,
 void MachineSMEABI::emitAllocateLazySaveBuffer(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
   MachineFrameInfo &MFI = MF->getFrameInfo();
+  auto *AFI = MF->getInfo<AArch64FunctionInfo>();
 
   DebugLoc DL = getDebugLoc(MBB, MBBI);
   Register SP = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
   Register SVL = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
-  Register Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+  Register Buffer = AFI->getEarlyAllocSMESaveBuffer();
 
   // Calculate SVL.
   BuildMI(MBB, MBBI, DL, TII->get(AArch64::RDSVLI_XI), SVL).addImm(1);
 
   // 1. Allocate the lazy save buffer.
-  {
+  if (Buffer == AArch64::NoRegister) {
     // TODO This function grows the stack with a subtraction, which doesn't work
     // on Windows. Some refactoring to share the functionality in
     // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
    // supports SME
     assert(!Subtarget->isTargetWindows() &&
            "Lazy ZA save is not yet supported on Windows");
+    Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
     // Get original stack pointer.
     BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), SP)
         .addReg(AArch64::SP);
@@ -686,8 +696,15 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
 
   // Allocate save buffer (if needed).
   if (State.TPIDR2Block) {
-    MachineBasicBlock &EntryBlock = MF.front();
-    emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
+    if (State.AfterSMEProloguePt) {
+      // Note: With inline stack probes the AfterSMEProloguePt may not be in the
+      // entry block (due to the probing loop).
+      emitAllocateLazySaveBuffer(*(*State.AfterSMEProloguePt)->getParent(),
+                                 *State.AfterSMEProloguePt);
+    } else {
+      MachineBasicBlock &EntryBlock = MF.front();
+      emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
+    }
   }
 
   return true;
Lines changed: 71 additions & 0 deletions (new test file)
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-NEWLOWERING
+
+declare void @private_za_callee()
+declare void @shared_za_callee() "aarch64_inout_za"
+
+define void @test_lazy_save() nounwind "aarch64_inout_za" {
+; CHECK-LABEL: test_lazy_save:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x30, x29, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    mul x9, x8, x8
+; CHECK-NEXT:    lsr x15, x9, #4
+; CHECK-NEXT:    bl __chkstk
+; CHECK-NEXT:    sub x9, sp, x15, lsl #4
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    stur x9, [x29, #-16]
+; CHECK-NEXT:    sub x9, x29, #16
+; CHECK-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    sturh w8, [x29, #-8]
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
+; CHECK-NEXT:    bl private_za_callee
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    sub x0, x29, #16
+; CHECK-NEXT:    cbnz x8, .LBB0_2
+; CHECK-NEXT:  // %bb.1:
+; CHECK-NEXT:    bl __arm_tpidr2_restore
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x29, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+;
+; CHECK-NEWLOWERING-LABEL: test_lazy_save:
+; CHECK-NEWLOWERING:       // %bb.0:
+; CHECK-NEWLOWERING-NEXT:    stp x30, x29, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    mov x29, sp
+; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #16
+; CHECK-NEWLOWERING-NEXT:    rdsvl x8, #1
+; CHECK-NEWLOWERING-NEXT:    mul x9, x8, x8
+; CHECK-NEWLOWERING-NEXT:    lsr x15, x9, #4
+; CHECK-NEWLOWERING-NEXT:    bl __chkstk
+; CHECK-NEWLOWERING-NEXT:    sub x9, sp, x15, lsl #4
+; CHECK-NEWLOWERING-NEXT:    mov sp, x9
+; CHECK-NEWLOWERING-NEXT:    sub x10, x29, #16
+; CHECK-NEWLOWERING-NEXT:    stp x9, x8, [x29, #-16]
+; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEWLOWERING-NEXT:    bl private_za_callee
+; CHECK-NEWLOWERING-NEXT:    smstart za
+; CHECK-NEWLOWERING-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEWLOWERING-NEXT:    sub x0, x29, #16
+; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB0_2
+; CHECK-NEWLOWERING-NEXT:  // %bb.1:
+; CHECK-NEWLOWERING-NEXT:    bl __arm_tpidr2_restore
+; CHECK-NEWLOWERING-NEXT:  .LBB0_2:
+; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEWLOWERING-NEXT:    mov sp, x29
+; CHECK-NEWLOWERING-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ldp x30, x29, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ret
+  call void @private_za_callee()
+  ret void
+}

llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll

Lines changed: 17 additions & 9 deletions
@@ -103,7 +103,6 @@ exit:
   ret float %ret
 }
 
-; FIXME: This is missing stack probes with -aarch64-new-sme-abi.
 define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float %c) "aarch64_inout_za" "probe-stack"="inline-asm" "stack-probe-size"="65536" {
 ; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe:
 ; CHECK: // %bb.0:
@@ -165,26 +164,35 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
 ; CHECK-NEWLOWERING-NEXT:    rdsvl x8, #1
 ; CHECK-NEWLOWERING-NEXT:    mov x9, sp
 ; CHECK-NEWLOWERING-NEXT:    msub x9, x8, x8, x9
+; CHECK-NEWLOWERING-NEXT:  .LBB2_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEWLOWERING-NEXT:    cmp sp, x9
+; CHECK-NEWLOWERING-NEXT:    b.le .LBB2_3
+; CHECK-NEWLOWERING-NEXT:  // %bb.2: // in Loop: Header=BB2_1 Depth=1
+; CHECK-NEWLOWERING-NEXT:    str xzr, [sp]
+; CHECK-NEWLOWERING-NEXT:    b .LBB2_1
+; CHECK-NEWLOWERING-NEXT:  .LBB2_3:
 ; CHECK-NEWLOWERING-NEXT:    mov sp, x9
+; CHECK-NEWLOWERING-NEXT:    ldr xzr, [sp]
 ; CHECK-NEWLOWERING-NEXT:    sub x10, x29, #16
 ; CHECK-NEWLOWERING-NEXT:    stp x9, x8, [x29, #-16]
 ; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x10
-; CHECK-NEWLOWERING-NEXT:    cbz w0, .LBB2_2
-; CHECK-NEWLOWERING-NEXT:  // %bb.1: // %use_b
+; CHECK-NEWLOWERING-NEXT:    cbz w0, .LBB2_5
+; CHECK-NEWLOWERING-NEXT:  // %bb.4: // %use_b
 ; CHECK-NEWLOWERING-NEXT:    fmov s1, #4.00000000
 ; CHECK-NEWLOWERING-NEXT:    fadd s0, s0, s1
-; CHECK-NEWLOWERING-NEXT:    b .LBB2_3
-; CHECK-NEWLOWERING-NEXT:  .LBB2_2: // %use_c
+; CHECK-NEWLOWERING-NEXT:    b .LBB2_6
+; CHECK-NEWLOWERING-NEXT:  .LBB2_5: // %use_c
 ; CHECK-NEWLOWERING-NEXT:    fmov s0, s1
 ; CHECK-NEWLOWERING-NEXT:    bl cosf
-; CHECK-NEWLOWERING-NEXT:  .LBB2_3: // %exit
+; CHECK-NEWLOWERING-NEXT:  .LBB2_6: // %exit
 ; CHECK-NEWLOWERING-NEXT:    smstart za
 ; CHECK-NEWLOWERING-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEWLOWERING-NEXT:    sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB2_5
-; CHECK-NEWLOWERING-NEXT:  // %bb.4: // %exit
+; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB2_8
+; CHECK-NEWLOWERING-NEXT:  // %bb.7: // %exit
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT:  .LBB2_5: // %exit
+; CHECK-NEWLOWERING-NEXT:  .LBB2_8: // %exit
 ; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEWLOWERING-NEXT:    mov sp, x29
 ; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
