Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1688,6 +1688,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
}
case AArch64::InOutZAUsePseudo:
case AArch64::RequiresZASavePseudo:
case AArch64::SMEStateAllocPseudo:
case AArch64::COALESCER_BARRIER_FPR16:
case AArch64::COALESCER_BARRIER_FPR32:
case AArch64::COALESCER_BARRIER_FPR64:
Expand Down
25 changes: 24 additions & 1 deletion llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8291,7 +8291,30 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
if (Subtarget->hasCustomCallingConv())
Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);

if (!getTM().useNewSMEABILowering() || Attrs.hasAgnosticZAInterface()) {
if (getTM().useNewSMEABILowering() && !Attrs.hasAgnosticZAInterface()) {
if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
SDValue Size;
if (Attrs.hasZAState()) {
SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
DAG.getConstant(1, DL, MVT::i32));
Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
}
if (Size) {
SDValue Buffer = DAG.getNode(
ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
{Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
Chain = Buffer.getValue(1);

Register BufferPtr =
MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
DAG.getVTList(MVT::Other), Chain);
FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
MFI.CreateVariableSizedObject(Align(16), nullptr);
}
}
} else {
// Old SME ABI lowering (deprecated):
// Create a 16 Byte TPIDR2 object. The dynamic buffer
// will be expanded and stored in the static object later using a
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
// Holds the SME function attributes (streaming mode, ZA/ZT0 state).
SMEAttrs SMEFnAttrs;

// Virtual register holding the address of the SME lazy-save buffer when that
// buffer is allocated early, during SelectionDAG lowering of the formal
// arguments (done for Windows targets and for functions using inline stack
// probes). Remains AArch64::NoRegister when no early allocation was made.
Register EarlyAllocSMESaveBuffer = AArch64::NoRegister;

// Note: The following properties are only used for the old SME ABI lowering:
/// The frame-index for the TPIDR2 object used for lazy saves.
TPIDR2Object TPIDR2;
Expand All @@ -256,6 +260,12 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
const override;

/// Records the virtual register that holds the address of an SME save buffer
/// allocated early (in SelectionDAG) rather than by the MachineSMEABI pass.
void setEarlyAllocSMESaveBuffer(Register Ptr) {
  EarlyAllocSMESaveBuffer = Ptr;
}

/// Returns the register recorded by setEarlyAllocSMESaveBuffer(), or
/// AArch64::NoRegister if no early allocation was recorded. Const-qualified
/// so it can be queried through a const AArch64FunctionInfo, matching the
/// other getters of this class.
Register getEarlyAllocSMESaveBuffer() const {
  return EarlyAllocSMESaveBuffer;
}

// Old SME ABI lowering state getters/setters:
/// Returns the register recorded via setSMESaveBufferAddr() (old SME ABI
/// lowering only).
Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; }
/// Records the register for the SME save buffer address (old SME ABI
/// lowering only).
void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; }
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ let hasSideEffects = 1, isMeta = 1 in {
def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
}

// Marker pseudo with no operands and no results; selected from the
// AArch64ISD::SME_STATE_ALLOC node. It marks the end of an SME save-buffer
// allocation that was emitted in SelectionDAG, giving the MachineSMEABI pass a
// safe point at which to insert its TPIDR2 block setup.
def SMEStateAllocPseudo : Pseudo<(outs), (ins), []>, Sched<[]>;

def CommitZASavePseudo
: Pseudo<(outs),
(ins GPR64:$tpidr2_el0, i1imm:$zero_za, i64imm:$commit_routine, variable_ops), []>,
Expand All @@ -108,6 +110,11 @@ def AArch64_requires_za_save
[SDNPHasChain, SDNPInGlue]>;
def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>;

// Chain-only SelectionDAG node (no results, no operands) for
// AArch64ISD::SME_STATE_ALLOC. Instruction selection maps it directly onto
// SMEStateAllocPseudo.
def AArch64_sme_state_alloc
: SDNode<"AArch64ISD::SME_STATE_ALLOC", SDTypeProfile<0, 0,[]>,
[SDNPHasChain]>;
def : Pat<(AArch64_sme_state_alloc), (SMEStateAllocPseudo)>;

//===----------------------------------------------------------------------===//
// Instruction naming conventions.
//===----------------------------------------------------------------------===//
Expand Down
34 changes: 26 additions & 8 deletions llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ struct MachineSMEABI : public MachineFunctionPass {
SmallVector<BlockInfo> Blocks;
SmallVector<ZAState> BundleStates;
std::optional<TPIDR2State> TPIDR2Block;
std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
} State;

MachineFunction *MF = nullptr;
Expand Down Expand Up @@ -298,6 +299,12 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
MachineBasicBlock::iterator MBBI(MI);
LiveUnits.stepBackward(MI);
LiveRegs PhysLiveRegs = GetPhysLiveRegs();
// The SMEStateAllocPseudo marker is added to a function if the save
// buffer was allocated in SelectionDAG. It marks the end of the
// allocation -- which is a safe point for this pass to insert any TPIDR2
// block setup.
if (MI.getOpcode() == AArch64::SMEStateAllocPseudo)
State.AfterSMEProloguePt = MBBI;
auto [NeededState, InsertPt] = getZAStateBeforeInst(
*TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface());
assert((InsertPt == MBBI ||
Expand Down Expand Up @@ -529,23 +536,27 @@ void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB,
void MachineSMEABI::emitAllocateLazySaveBuffer(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
MachineFrameInfo &MFI = MF->getFrameInfo();
auto *AFI = MF->getInfo<AArch64FunctionInfo>();

DebugLoc DL = getDebugLoc(MBB, MBBI);
Register SP = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
Register SVL = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
Register Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
Register Buffer = AFI->getEarlyAllocSMESaveBuffer();

// Calculate SVL.
BuildMI(MBB, MBBI, DL, TII->get(AArch64::RDSVLI_XI), SVL).addImm(1);

// 1. Allocate the lazy save buffer.
{
// TODO This function grows the stack with a subtraction, which doesn't work
// on Windows. Some refactoring to share the functionality in
// LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
// supports SME
if (Buffer == AArch64::NoRegister) {
// TODO: On Windows, we allocate the lazy save buffer in SelectionDAG (so
// Buffer != AArch64::NoRegister). This is done to reuse the existing
// expansions (which can insert stack checks). This works, but it means we
// will always allocate the lazy save buffer (even if the function contains
// no lazy saves). If we want to handle Windows here, we'll need to
// implement something similar to LowerWindowsDYNAMIC_STACKALLOC.
assert(!Subtarget->isTargetWindows() &&
"Lazy ZA save is not yet supported on Windows");
Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
// Get original stack pointer.
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), SP)
.addReg(AArch64::SP);
Expand Down Expand Up @@ -686,8 +697,15 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {

// Allocate save buffer (if needed).
if (State.TPIDR2Block) {
MachineBasicBlock &EntryBlock = MF.front();
emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
if (State.AfterSMEProloguePt) {
// Note: With inline stack probes the AfterSMEProloguePt may not be in the
// entry block (due to the probing loop).
emitAllocateLazySaveBuffer(*(*State.AfterSMEProloguePt)->getParent(),
*State.AfterSMEProloguePt);
} else {
MachineBasicBlock &EntryBlock = MF.front();
emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
}
}

return true;
Expand Down
39 changes: 39 additions & 0 deletions llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme -aarch64-new-sme-abi < %s | FileCheck %s

declare void @private_za_callee()
declare void @shared_za_callee() "aarch64_inout_za"

; A function with shared ZA state ("aarch64_inout_za") calls a private-ZA
; callee, so a lazy save must be set up around the call. On Windows the
; SVL x SVL byte save buffer is expected to be allocated through __chkstk
; rather than by a plain stack-pointer subtraction. Both RUN lines (old and new
; SME ABI lowering) share the same CHECK prefix, so they must produce identical
; code.
define void @test_lazy_save() nounwind "aarch64_inout_za" {
; CHECK-LABEL: test_lazy_save:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x30, x29, [sp, #-32]! // 16-byte Folded Spill
; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mul x9, x8, x8
; CHECK-NEXT: lsr x15, x9, #4
; CHECK-NEXT: bl __chkstk
; CHECK-NEXT: sub x9, sp, x15, lsl #4
; CHECK-NEXT: mov sp, x9
; CHECK-NEXT: sub x10, x29, #16
; CHECK-NEXT: stp x9, x8, [x29, #-16]
; CHECK-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
; CHECK-NEXT: cbnz x8, .LBB0_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: bl __arm_tpidr2_restore
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: ldp x30, x29, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @private_za_callee()
ret void
}
26 changes: 17 additions & 9 deletions llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ exit:
ret float %ret
}

; FIXME: This is missing stack probes with -aarch64-new-sme-abi.
define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float %c) "aarch64_inout_za" "probe-stack"="inline-asm" "stack-probe-size"="65536" {
; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe:
; CHECK: // %bb.0:
Expand Down Expand Up @@ -157,26 +156,35 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
; CHECK-NEWLOWERING-NEXT: mov x9, sp
; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
; CHECK-NEWLOWERING-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1
; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEWLOWERING-NEXT: cmp sp, x9
; CHECK-NEWLOWERING-NEXT: b.le .LBB2_3
; CHECK-NEWLOWERING-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1
; CHECK-NEWLOWERING-NEXT: str xzr, [sp]
; CHECK-NEWLOWERING-NEXT: b .LBB2_1
; CHECK-NEWLOWERING-NEXT: .LBB2_3:
; CHECK-NEWLOWERING-NEXT: mov sp, x9
; CHECK-NEWLOWERING-NEXT: ldr xzr, [sp]
; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB2_2
; CHECK-NEWLOWERING-NEXT: // %bb.1: // %use_b
; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB2_5
; CHECK-NEWLOWERING-NEXT: // %bb.4: // %use_b
; CHECK-NEWLOWERING-NEXT: fmov s1, #4.00000000
; CHECK-NEWLOWERING-NEXT: fadd s0, s0, s1
; CHECK-NEWLOWERING-NEXT: b .LBB2_3
; CHECK-NEWLOWERING-NEXT: .LBB2_2: // %use_c
; CHECK-NEWLOWERING-NEXT: b .LBB2_6
; CHECK-NEWLOWERING-NEXT: .LBB2_5: // %use_c
; CHECK-NEWLOWERING-NEXT: fmov s0, s1
; CHECK-NEWLOWERING-NEXT: bl cosf
; CHECK-NEWLOWERING-NEXT: .LBB2_3: // %exit
; CHECK-NEWLOWERING-NEXT: .LBB2_6: // %exit
; CHECK-NEWLOWERING-NEXT: smstart za
; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_5
; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_8
; CHECK-NEWLOWERING-NEXT: // %bb.7: // %exit
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
; CHECK-NEWLOWERING-NEXT: .LBB2_5: // %exit
; CHECK-NEWLOWERING-NEXT: .LBB2_8: // %exit
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEWLOWERING-NEXT: mov sp, x29
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
Expand Down
Loading