Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions llvm/include/llvm/CodeGen/MachineFrameInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ class CalleeSavedInfo {
/// Flag indicating whether the register is spilled to stack or another
/// register.
bool SpilledToReg = false;
/// Flag indicating whether this CSI has been handled by the target and can be
/// skipped by the generic code in the prolog/epilog inserter.
bool IsHandledByTarget = false;

public:
explicit CalleeSavedInfo(MCRegister R, int FI = 0) : Reg(R), FrameIdx(FI) {}
Expand All @@ -61,6 +64,7 @@ class CalleeSavedInfo {
MCRegister getReg() const { return Reg; }
int getFrameIdx() const { return FrameIdx; }
MCRegister getDstReg() const { return DstReg; }
void setReg(MCRegister R) { Reg = R; }
void setFrameIdx(int FI) {
FrameIdx = FI;
SpilledToReg = false;
Expand All @@ -72,6 +76,9 @@ class CalleeSavedInfo {
bool isRestored() const { return Restored; }
void setRestored(bool R) { Restored = R; }
bool isSpilledToReg() const { return SpilledToReg; }

bool isHandledByTarget() const { return IsHandledByTarget; }
void setHandledByTarget() { IsHandledByTarget = true; }
};

/// The MachineFrameInfo class represents an abstract stack frame until
Expand Down
7 changes: 6 additions & 1 deletion llvm/lib/CodeGen/PrologEpilogInserter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -478,7 +478,7 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F,
for (auto &CS : CSI) {
// If the target has spilled this register to another register, we don't
// need to allocate a stack slot.
if (CS.isSpilledToReg())
if (CS.isSpilledToReg() || CS.isHandledByTarget())
continue;

MCRegister Reg = CS.getReg();
Expand Down Expand Up @@ -604,6 +604,8 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
MachineBasicBlock::iterator I = SaveBlock.begin();
if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
for (const CalleeSavedInfo &CS : CSI) {
if (CS.isHandledByTarget())
continue;
// Insert the spill to the stack frame.
MCRegister Reg = CS.getReg();

Expand Down Expand Up @@ -634,6 +636,9 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,

if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) {
for (const CalleeSavedInfo &CI : reverse(CSI)) {
if (CI.isHandledByTarget())
continue;

MCRegister Reg = CI.getReg();
if (CI.isSpilledToReg()) {
BuildMI(RestoreBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY), Reg)
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1239,6 +1239,14 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts",
"v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32"
>;

// Enable the use of SCRATCH_STORE/LOAD_BLOCK instructions for saving and
// restoring the callee-saved registers.
def FeatureUseBlockVGPROpsForCSR : SubtargetFeature<"block-vgpr-csr",
"UseBlockVGPROpsForCSR",
"true",
"Use block load/store for VGPR callee saved registers"
>;

// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
Expand Down
35 changes: 35 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "AMDGPUMachineFunction.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/IR/Constants.h"
Expand Down Expand Up @@ -239,6 +240,36 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
return AsmPrinter::lowerConstant(CV);
}

static void emitVGPRBlockComment(const MachineInstr *MI, MCStreamer &OS) {
// The instruction will only transfer a subset of the registers in the block,
// based on the mask that is stored in m0. We could search for the instruction
// that sets m0, but most of the time we'll already have the mask stored in
// the machine function info. Try to use that. This assumes that we only use
// block loads/stores for CSR spills.
const MachineFunction *MF = MI->getParent()->getParent();
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();

Register RegBlock =
TII->getNamedOperand(*MI, MI->mayLoad() ? AMDGPU::OpName::vdst
: AMDGPU::OpName::vdata)
->getReg();
Register FirstRegInBlock = TRI.getSubReg(RegBlock, AMDGPU::sub0);
uint32_t Mask = MFI->getMaskForVGPRBlockOps(RegBlock);

SmallString<512> TransferredRegs;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can avoid the temporary SmallString, emitRawComment uses Twine anyway

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wasn't Twine only meant to be used in function arguments?

Or did you mean to emit one comment for each transferred register?

for (unsigned I = 0; I < 32; ++I) {
if (Mask & (1 << I)) {
(llvm::Twine(" ") + TRI.getName(FirstRegInBlock + I))
.toVector(TransferredRegs);
}
}

if (!TransferredRegs.empty())
OS.emitRawComment(" transferring at most " + TransferredRegs);
}

void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
// FIXME: Enable feature predicate checks once all the test pass.
// AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(),
Expand Down Expand Up @@ -327,6 +358,10 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
return;
}

if (STI.getInstrInfo()->isBlockLoadStore(MI->getOpcode()))
if (isVerbose())
emitVGPRBlockComment(MI, *OutStreamer);

MCInst TmpInst;
MCInstLowering.lower(MI, TmpInst);
EmitToStreamer(*OutStreamer, TmpInst);
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasMinimum3Maximum3PKF16 = false;

bool RequiresCOV6 = false;
bool UseBlockVGPROpsForCSR = false;

// Dummy feature to use for assembler in tablegen.
bool FeatureDisable = false;
Expand Down Expand Up @@ -1265,6 +1266,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

bool requiresCodeObjectV6() const { return RequiresCOV6; }

bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }

bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }

bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }
Expand Down
198 changes: 198 additions & 0 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1694,6 +1694,109 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
}
}

/// Group callee-saved VGPRs into 1024-bit register blocks and assign each
/// block a single stack slot, so the whole block can be transferred with one
/// scratch block load/store. The CSI entry for the lowest register of a block
/// is retargeted to the block super-register and marked as handled by the
/// target; the entries for the other registers covered by the block's mask
/// are removed from \p CSI.
static void assignSlotsUsingVGPRBlocks(MachineFunction &MF,
                                       const GCNSubtarget &ST,
                                       const TargetRegisterInfo *TRI,
                                       std::vector<CalleeSavedInfo> &CSI,
                                       unsigned &MinCSFrameIndex,
                                       unsigned &MaxCSFrameIndex) {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIRegisterInfo *MRI = ST.getRegisterInfo();

  assert(std::is_sorted(CSI.begin(), CSI.end(),
                        [](const CalleeSavedInfo &A, const CalleeSavedInfo &B) {
                          return A.getReg() < B.getReg();
                        }) &&
         "Callee saved registers not sorted");

  // Only stack-spilled, non-WWM VGPRs can be covered by a block transfer.
  auto CanUseBlockOps = [&](const CalleeSavedInfo &CSI) {
    return !CSI.isSpilledToReg() &&
           MRI->isVGPR(MF.getRegInfo(), CSI.getReg()) &&
           !FuncInfo->isWWMReservedRegister(CSI.getReg());
  };

  auto CSEnd = CSI.end();
  for (auto CSIt = CSI.begin(); CSIt != CSEnd; ++CSIt) {
    Register Reg = CSIt->getReg();
    if (!CanUseBlockOps(*CSIt))
      continue;

    // Find all the regs that will fit in a 32-bit mask starting at the current
    // reg and build said mask. It should have 1 for every register that's
    // included, with the current register as the least significant bit. The
    // shift must be unsigned because the delta can be as large as 31.
    uint32_t Mask = 1;
    CSEnd = std::remove_if(
        CSIt + 1, CSEnd, [&](const CalleeSavedInfo &CSI) -> bool {
          if (CanUseBlockOps(CSI) && CSI.getReg() < Reg + 32) {
            Mask |= 1u << (CSI.getReg() - Reg);
            return true;
          }
          return false;
        });

    const TargetRegisterClass *BlockRegClass = &AMDGPU::VReg_1024RegClass;
    Register RegBlock =
        MRI->getMatchingSuperReg(Reg, AMDGPU::sub0, BlockRegClass);
    if (!RegBlock) {
      // We couldn't find a super register for the block. This can happen if
      // the register we started with is too high (e.g. v232 if the maximum is
      // v255). We therefore try to get the last register block and figure out
      // the mask from there.
      Register LastBlockStart =
          AMDGPU::VGPR0 + alignDown(Reg - AMDGPU::VGPR0, 32);
      RegBlock =
          MRI->getMatchingSuperReg(LastBlockStart, AMDGPU::sub0, BlockRegClass);
      assert(RegBlock && MRI->isSubRegister(RegBlock, Reg) &&
             "Couldn't find super register");
      int RegDelta = Reg - LastBlockStart;
      assert(RegDelta > 0 && llvm::countl_zero(Mask) >= RegDelta &&
             "Bad shift amount");
      Mask <<= RegDelta;
    }

    FuncInfo->setMaskForVGPRBlockOps(RegBlock, Mask);

    // The stack objects can be a bit smaller than the register block if we know
    // some of the high bits of Mask are 0. This may happen often with calling
    // conventions where the caller and callee-saved VGPRs are interleaved at
    // a small boundary (e.g. 8 or 16).
    int UnusedBits = llvm::countl_zero(Mask);
    unsigned BlockSize = MRI->getSpillSize(*BlockRegClass) - UnusedBits * 4;
    int FrameIdx =
        MFI.CreateStackObject(BlockSize, MRI->getSpillAlign(*BlockRegClass),
                              /*isSpillSlot=*/true);
    if ((unsigned)FrameIdx < MinCSFrameIndex)
      MinCSFrameIndex = FrameIdx;
    if ((unsigned)FrameIdx > MaxCSFrameIndex)
      MaxCSFrameIndex = FrameIdx;

    // Retarget this entry so the frame lowering emits a single block
    // save/restore for it and generic PEI skips it.
    CSIt->setFrameIdx(FrameIdx);
    CSIt->setReg(RegBlock);
    CSIt->setHandledByTarget();
  }
  CSI.erase(CSEnd, CSI.end());
}

bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
    unsigned &MaxCSFrameIndex) const {
  // Nothing to do if the function doesn't clobber any callee-saved registers.
  if (CSI.empty())
    return true;

  // When the subtarget supports it, fold callee-saved VGPRs into register
  // blocks with one spill slot each, then let the generic per-register
  // assignment handle whatever is left.
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.useVGPRBlockOpsForCSR())
    assignSlotsUsingVGPRBlocks(MF, ST, TRI, CSI, MinCSFrameIndex,
                               MaxCSFrameIndex);

  return assignCalleeSavedSpillSlots(MF, TRI, CSI);
}

bool SIFrameLowering::assignCalleeSavedSpillSlots(
MachineFunction &MF, const TargetRegisterInfo *TRI,
std::vector<CalleeSavedInfo> &CSI) const {
Expand Down Expand Up @@ -1763,6 +1866,101 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
return true;
}

/// Emit block stores for the callee-saved register blocks created by
/// assignSlotsUsingVGPRBlocks. Always returns false so that generic PEI code
/// still spills the CSI entries not marked as handled by the target.
bool SIFrameLowering::spillCalleeSavedRegisters(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  if (!ST.useVGPRBlockOpsForCSR())
    return false;

  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  for (const CalleeSavedInfo &CS : CSI) {
    // Only register blocks are spilled here; everything else is left to the
    // generic code.
    if (!CS.isHandledByTarget())
      continue;

    // Build a scratch block store.
    Register Reg = CS.getReg();
    uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(Reg);
    int FrameIndex = CS.getFrameIdx();
    MachinePointerInfo PtrInfo =
        MachinePointerInfo::getFixedStack(*MF, FrameIndex);
    MachineMemOperand *MMO =
        MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                 FrameInfo.getObjectSize(FrameIndex),
                                 FrameInfo.getObjectAlign(FrameIndex));

    BuildMI(MBB, MI, MI->getDebugLoc(),
            TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_SAVE))
        .addReg(Reg, getKillRegState(false))
        .addFrameIndex(FrameIndex)
        .addReg(FuncInfo->getStackPtrOffsetReg())
        .addImm(0)
        .addImm(Mask)
        .addMemOperand(MMO);

    FuncInfo->setHasSpilledVGPRs();

    // Add the register to the liveins. This is necessary because if any of the
    // VGPRs in the register block is reserved (e.g. if it's a WWM register),
    // then the whole block will be marked as reserved and `updateLiveness` will
    // skip it.
    if (!MBB.isLiveIn(Reg))
      MBB.addLiveIn(Reg);
  }

  return false;
}

/// Emit block loads for the callee-saved register blocks created by
/// assignSlotsUsingVGPRBlocks. Always returns false so that generic PEI code
/// still restores the CSI entries not marked as handled by the target.
bool SIFrameLowering::restoreCalleeSavedRegisters(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  if (!ST.useVGPRBlockOpsForCSR())
    return false;

  SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *SIRI = static_cast<const SIRegisterInfo *>(TRI);

  // Mirror insertCSRRestores by walking the CSRs in reverse.
  for (const CalleeSavedInfo &CS : reverse(CSI)) {
    if (!CS.isHandledByTarget())
      continue;

    // Build a scratch block load.
    Register BlockReg = CS.getReg();
    uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
    int FrameIndex = CS.getFrameIdx();
    MachinePointerInfo PtrInfo =
        MachinePointerInfo::getFixedStack(*MF, FrameIndex);
    MachineMemOperand *MMO = MF->getMachineMemOperand(
        PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
        FrameInfo.getObjectAlign(FrameIndex));

    auto MIB = BuildMI(MBB, MI, MI->getDebugLoc(),
                       TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE), BlockReg)
                   .addFrameIndex(FrameIndex)
                   .addReg(FuncInfo->getStackPtrOffsetReg())
                   .addImm(0)
                   .addImm(Mask)
                   .addMemOperand(MMO);
    SIRI->addImplicitUsesForBlockCSRLoad(MIB, BlockReg);

    // Add the register to the liveins. This is necessary because if any of the
    // VGPRs in the register block is reserved (e.g. if it's a WWM register),
    // then the whole block will be marked as reserved and `updateLiveness` will
    // skip it.
    if (!MBB.isLiveIn(BlockReg))
      MBB.addLiveIn(BlockReg);
  }

  return false;
}

MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF,
MachineBasicBlock &MBB,
Expand Down
17 changes: 17 additions & 0 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,23 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
const TargetRegisterInfo *TRI,
std::vector<CalleeSavedInfo> &CSI) const override;

/// Assign stack slots for the callee-saved registers. When the subtarget
/// enables block VGPR ops, callee-saved VGPRs are first grouped into register
/// blocks that each get a single slot, before falling back to the generic
/// per-register assignment.
bool assignCalleeSavedSpillSlots(MachineFunction &MF,
const TargetRegisterInfo *TRI,
std::vector<CalleeSavedInfo> &CSI,
unsigned &MinCSFrameIndex,
unsigned &MaxCSFrameIndex) const override;

/// Emit block stores for CSI entries the target marked as handled; returns
/// false so generic PEI code spills the remaining entries.
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;

/// Emit block loads for CSI entries the target marked as handled; returns
/// false so generic PEI code restores the remaining entries.
bool
restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
MutableArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;

bool allocateScavengingFrameIndexesNearIncomingSP(
const MachineFunction &MF) const override;

Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,18 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
}

/// Return true if \p Opcode is one of the scratch block load/store
/// instructions or their spill pseudos.
static bool isBlockLoadStore(uint16_t Opcode) {
  return Opcode == AMDGPU::SI_BLOCK_SPILL_V1024_SAVE ||
         Opcode == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE ||
         Opcode == AMDGPU::SCRATCH_STORE_BLOCK_SADDR ||
         Opcode == AMDGPU::SCRATCH_LOAD_BLOCK_SADDR;
}

static bool isEXP(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::EXP;
}
Expand Down
Loading