Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1688,6 +1688,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
}
case AArch64::InOutZAUsePseudo:
case AArch64::RequiresZASavePseudo:
case AArch64::SMEStateAllocPseudo:
case AArch64::COALESCER_BARRIER_FPR16:
case AArch64::COALESCER_BARRIER_FPR32:
case AArch64::COALESCER_BARRIER_FPR64:
Expand Down
34 changes: 33 additions & 1 deletion llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8291,7 +8291,39 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
if (Subtarget->hasCustomCallingConv())
Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);

if (!getTM().useNewSMEABILowering() || Attrs.hasAgnosticZAInterface()) {
if (getTM().useNewSMEABILowering() && !Attrs.hasAgnosticZAInterface()) {
if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
SDValue Size;
if (Attrs.hasZAState()) {
SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
DAG.getConstant(1, DL, MVT::i32));
Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
} else if (Attrs.hasAgnosticZAInterface()) {
RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
SDValue Callee = DAG.getExternalSymbol(
getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
getLibcallCallingConv(LC), RetTy, Callee, {});
std::tie(Size, Chain) = LowerCallTo(CLI);
}
if (Size) {
SDValue Buffer = DAG.getNode(
ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
{Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
Chain = Buffer.getValue(1);

Register BufferPtr =
MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
DAG.getVTList(MVT::Other), Chain);
FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
MFI.CreateVariableSizedObject(Align(16), nullptr);
}
}
} else {
// Old SME ABI lowering (deprecated):
// Create a 16 Byte TPIDR2 object. The dynamic buffer
// will be expanded and stored in the static object later using a
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
// Holds the SME function attributes (streaming mode, ZA/ZT0 state).
SMEAttrs SMEFnAttrs;

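// Holds the virtual register containing the address of the SME save buffer
// if it was allocated early in SelectionDAG (for Windows/stack probes
// support). AArch64::NoRegister if no early allocation was done.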
Register EarlyAllocSMESaveBuffer = AArch64::NoRegister;

// Note: The following properties are only used for the old SME ABI lowering:
/// The frame-index for the TPIDR2 object used for lazy saves.
TPIDR2Object TPIDR2;
Expand All @@ -256,6 +260,12 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
const override;

/// Records \p Ptr as the register holding the early-allocated SME save buffer.
void setEarlyAllocSMESaveBuffer(Register Ptr) { EarlyAllocSMESaveBuffer = Ptr; }

/// Returns the register recorded via setEarlyAllocSMESaveBuffer(), or
/// AArch64::NoRegister if no early SME save buffer allocation was recorded.
/// Const-qualified (like getSMESaveBufferAddr()) so it can be queried through
/// a const AArch64FunctionInfo.
Register getEarlyAllocSMESaveBuffer() const {
return EarlyAllocSMESaveBuffer;
}

// Old SME ABI lowering state getters/setters:
/// Returns the register recorded as the SME save buffer address (old SME ABI
/// lowering state).
Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; }
/// Records \p Reg as the SME save buffer address (old SME ABI lowering state).
void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; }
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ let hasSideEffects = 1, isMeta = 1 in {
def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
}

// Marker pseudo with no operands. The MachineSMEABI pass treats it as the
// point after which TPIDR2 block setup may be inserted when the save buffer
// was allocated in SelectionDAG.
// NOTE(review): unlike the marker pseudos above, this def is not inside the
// `let hasSideEffects = 1, isMeta = 1` block -- confirm that is intentional.
def SMEStateAllocPseudo : Pseudo<(outs), (ins), []>, Sched<[]>;

def CommitZASavePseudo
: Pseudo<(outs),
(ins GPR64:$tpidr2_el0, i1imm:$zero_za, i64imm:$commit_routine, variable_ops), []>,
Expand All @@ -108,6 +110,11 @@ def AArch64_requires_za_save
[SDNPHasChain, SDNPInGlue]>;
def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>;

// Chain-only node (no results, no operands) that is selected to
// SMEStateAllocPseudo. LowerFormalArguments emits it after the early
// save-buffer allocation.
def AArch64_sme_state_alloc
: SDNode<"AArch64ISD::SME_STATE_ALLOC", SDTypeProfile<0, 0,[]>,
[SDNPHasChain]>;
def : Pat<(AArch64_sme_state_alloc), (SMEStateAllocPseudo)>;

//===----------------------------------------------------------------------===//
// Instruction naming conventions.
//===----------------------------------------------------------------------===//
Expand Down
35 changes: 27 additions & 8 deletions llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ struct MachineSMEABI : public MachineFunctionPass {
SmallVector<BlockInfo> Blocks;
SmallVector<ZAState> BundleStates;
std::optional<TPIDR2State> TPIDR2Block;
std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
} State;

MachineFunction *MF = nullptr;
Expand Down Expand Up @@ -298,6 +299,13 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
MachineBasicBlock::iterator MBBI(MI);
LiveUnits.stepBackward(MI);
LiveRegs PhysLiveRegs = GetPhysLiveRegs();
// The SMEStateAllocPseudo marker is added to a function if the save
// buffer was allocated in SelectionDAG. It marks the end of the
// allocation -- which is a safe point for this pass to insert any TPIDR2
// block setup.
if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) {
State.AfterSMEProloguePt = MBBI;
}
auto [NeededState, InsertPt] = getZAStateBeforeInst(
*TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface());
assert((InsertPt == MBBI ||
Expand Down Expand Up @@ -529,23 +537,27 @@ void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB,
void MachineSMEABI::emitAllocateLazySaveBuffer(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
MachineFrameInfo &MFI = MF->getFrameInfo();
auto *AFI = MF->getInfo<AArch64FunctionInfo>();

DebugLoc DL = getDebugLoc(MBB, MBBI);
Register SP = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
Register SVL = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
Register Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
Register Buffer = AFI->getEarlyAllocSMESaveBuffer();

// Calculate SVL.
BuildMI(MBB, MBBI, DL, TII->get(AArch64::RDSVLI_XI), SVL).addImm(1);

// 1. Allocate the lazy save buffer.
{
// TODO This function grows the stack with a subtraction, which doesn't work
// on Windows. Some refactoring to share the functionality in
// LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
// supports SME
if (Buffer == AArch64::NoRegister) {
// TODO: On Windows, we allocate the lazy save buffer in SelectionDAG (so
// Buffer != AArch64::NoRegister). This is done to reuse the existing
// expansions (which can insert stack checks). This works, but it means we
// will always allocate the lazy save buffer (even if the function contains
// no lazy saves). If we want to handle Windows here, we'll need to
// implement something similar to LowerWindowsDYNAMIC_STACKALLOC.
assert(!Subtarget->isTargetWindows() &&
"Lazy ZA save is not yet supported on Windows");
Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
// Get original stack pointer.
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), SP)
.addReg(AArch64::SP);
Expand Down Expand Up @@ -686,8 +698,15 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {

// Allocate save buffer (if needed).
if (State.TPIDR2Block) {
MachineBasicBlock &EntryBlock = MF.front();
emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
if (State.AfterSMEProloguePt) {
// Note: With inline stack probes the AfterSMEProloguePt may not be in the
// entry block (due to the probing loop).
emitAllocateLazySaveBuffer(*(*State.AfterSMEProloguePt)->getParent(),
*State.AfterSMEProloguePt);
} else {
MachineBasicBlock &EntryBlock = MF.front();
emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
}
}

return true;
Expand Down
39 changes: 39 additions & 0 deletions llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme -aarch64-new-sme-abi < %s | FileCheck %s

declare void @private_za_callee()
declare void @shared_za_callee() "aarch64_inout_za"

; Lazy ZA save around a private-ZA call on a Windows target. The CHECK lines
; expect the save buffer (SVL * SVL bytes) to be allocated through a call to
; __chkstk before the TPIDR2 block is set up.
define void @test_lazy_save() nounwind "aarch64_inout_za" {
; CHECK-LABEL: test_lazy_save:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x30, x29, [sp, #-32]! // 16-byte Folded Spill
; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mul x9, x8, x8
; CHECK-NEXT: lsr x15, x9, #4
; CHECK-NEXT: bl __chkstk
; CHECK-NEXT: sub x9, sp, x15, lsl #4
; CHECK-NEXT: mov sp, x9
; CHECK-NEXT: sub x10, x29, #16
; CHECK-NEXT: stp x9, x8, [x29, #-16]
; CHECK-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
; CHECK-NEXT: cbnz x8, .LBB0_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: bl __arm_tpidr2_restore
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: ldp x30, x29, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @private_za_callee()
ret void
}
26 changes: 17 additions & 9 deletions llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ exit:
ret float %ret
}

; FIXME: This is missing stack probes with -aarch64-new-sme-abi.
define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float %c) "aarch64_inout_za" "probe-stack"="inline-asm" "stack-probe-size"="65536" {
; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe:
; CHECK: // %bb.0:
Expand Down Expand Up @@ -157,26 +156,35 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
; CHECK-NEWLOWERING-NEXT: mov x9, sp
; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
; CHECK-NEWLOWERING-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1
; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEWLOWERING-NEXT: cmp sp, x9
; CHECK-NEWLOWERING-NEXT: b.le .LBB2_3
; CHECK-NEWLOWERING-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1
; CHECK-NEWLOWERING-NEXT: str xzr, [sp]
; CHECK-NEWLOWERING-NEXT: b .LBB2_1
; CHECK-NEWLOWERING-NEXT: .LBB2_3:
; CHECK-NEWLOWERING-NEXT: mov sp, x9
; CHECK-NEWLOWERING-NEXT: ldr xzr, [sp]
; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB2_2
; CHECK-NEWLOWERING-NEXT: // %bb.1: // %use_b
; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB2_5
; CHECK-NEWLOWERING-NEXT: // %bb.4: // %use_b
; CHECK-NEWLOWERING-NEXT: fmov s1, #4.00000000
; CHECK-NEWLOWERING-NEXT: fadd s0, s0, s1
; CHECK-NEWLOWERING-NEXT: b .LBB2_3
; CHECK-NEWLOWERING-NEXT: .LBB2_2: // %use_c
; CHECK-NEWLOWERING-NEXT: b .LBB2_6
; CHECK-NEWLOWERING-NEXT: .LBB2_5: // %use_c
; CHECK-NEWLOWERING-NEXT: fmov s0, s1
; CHECK-NEWLOWERING-NEXT: bl cosf
; CHECK-NEWLOWERING-NEXT: .LBB2_3: // %exit
; CHECK-NEWLOWERING-NEXT: .LBB2_6: // %exit
; CHECK-NEWLOWERING-NEXT: smstart za
; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_5
; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_8
; CHECK-NEWLOWERING-NEXT: // %bb.7: // %exit
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
; CHECK-NEWLOWERING-NEXT: .LBB2_5: // %exit
; CHECK-NEWLOWERING-NEXT: .LBB2_8: // %exit
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEWLOWERING-NEXT: mov sp, x29
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
Expand Down