Skip to content

Commit 1ff7e72

Browse files
committed
[AMDGPU] Support block load/store for CSR
Add support for using the existing `SCRATCH_STORE_BLOCK` and `SCRATCH_LOAD_BLOCK` instructions for saving and restoring callee-saved VGPRs. This is controlled by a new subtarget feature, `block-vgpr-csr`. It does not include WWM registers - those will be saved and restored individually, just like before. This patch does not change the ABI.

Use of this feature may lead to slightly increased stack usage, because the memory is not compacted if certain registers don't have to be transferred (this will happen in practice for calling conventions where the callee and caller saved registers are interleaved in groups of 8). However, if the registers at the end of the block of 32 don't have to be transferred, we don't need to use a whole 128-byte stack slot - we can trim some space off the end of the range.

In order to implement this feature, we need to rely less on the target-independent code in the PrologEpilogInserter, so we override several new methods in `SIFrameLowering`. We also add new pseudos, `SI_BLOCK_SPILL_V1024_SAVE/RESTORE`.

One peculiarity is that both the `SI_BLOCK_SPILL_V1024_RESTORE` pseudo and the `SCRATCH_LOAD_BLOCK` instructions will have all the registers that are not transferred added as implicit uses. This is done in order to inform LiveRegUnits that those registers are not available before the restore (since we're not really restoring them - so we can't afford to scavenge them). Unfortunately, this trick doesn't work with the save, so before the save all the registers in the block will be unavailable (see the unit test).
1 parent c6e2cbe commit 1ff7e72

18 files changed

+1022
-14
lines changed

llvm/include/llvm/CodeGen/MachineFrameInfo.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@ class CalleeSavedInfo {
5353
/// Flag indicating whether the register is spilled to stack or another
5454
/// register.
5555
bool SpilledToReg = false;
56+
/// Flag indicating whether this CSI has been handled by the target and can be
57+
/// skipped by the generic code in the prolog/epilog inserter.
58+
bool IsHandledByTarget = false;
5659

5760
public:
5861
explicit CalleeSavedInfo(MCRegister R, int FI = 0) : Reg(R), FrameIdx(FI) {}
@@ -61,6 +64,7 @@ class CalleeSavedInfo {
6164
MCRegister getReg() const { return Reg; }
6265
int getFrameIdx() const { return FrameIdx; }
6366
MCRegister getDstReg() const { return DstReg; }
67+
void setReg(MCRegister R) { Reg = R; }
6468
void setFrameIdx(int FI) {
6569
FrameIdx = FI;
6670
SpilledToReg = false;
@@ -72,6 +76,9 @@ class CalleeSavedInfo {
7276
bool isRestored() const { return Restored; }
7377
void setRestored(bool R) { Restored = R; }
7478
bool isSpilledToReg() const { return SpilledToReg; }
79+
80+
bool isHandledByTarget() const { return IsHandledByTarget; }
81+
void setHandledByTarget() { IsHandledByTarget = true; }
7582
};
7683

7784
/// The MachineFrameInfo class represents an abstract stack frame until

llvm/lib/CodeGen/PrologEpilogInserter.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -478,7 +478,7 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F,
478478
for (auto &CS : CSI) {
479479
// If the target has spilled this register to another register, we don't
480480
// need to allocate a stack slot.
481-
if (CS.isSpilledToReg())
481+
if (CS.isSpilledToReg() || CS.isHandledByTarget())
482482
continue;
483483

484484
MCRegister Reg = CS.getReg();
@@ -604,6 +604,8 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
604604
MachineBasicBlock::iterator I = SaveBlock.begin();
605605
if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
606606
for (const CalleeSavedInfo &CS : CSI) {
607+
if (CS.isHandledByTarget())
608+
continue;
607609
// Insert the spill to the stack frame.
608610
MCRegister Reg = CS.getReg();
609611

@@ -634,6 +636,9 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
634636

635637
if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) {
636638
for (const CalleeSavedInfo &CI : reverse(CSI)) {
639+
if (CI.isHandledByTarget())
640+
continue;
641+
637642
MCRegister Reg = CI.getReg();
638643
if (CI.isSpilledToReg()) {
639644
BuildMI(RestoreBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY), Reg)

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1239,6 +1239,14 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts",
12391239
"v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32"
12401240
>;
12411241

1242+
// Enable the use of SCRATCH_STORE/LOAD_BLOCK instructions for saving and
1243+
// restoring the callee-saved registers.
1244+
def FeatureUseBlockVGPROpsForCSR : SubtargetFeature<"block-vgpr-csr",
1245+
"UseBlockVGPROpsForCSR",
1246+
"true",
1247+
"Use block load/store for VGPR callee saved registers"
1248+
>;
1249+
12421250
// Dummy feature used to disable assembler instructions.
12431251
def FeatureDisable : SubtargetFeature<"",
12441252
"FeatureDisable","true",

llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "AMDGPUMachineFunction.h"
1919
#include "MCTargetDesc/AMDGPUInstPrinter.h"
2020
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21+
#include "SIMachineFunctionInfo.h"
2122
#include "llvm/CodeGen/MachineBasicBlock.h"
2223
#include "llvm/CodeGen/MachineInstr.h"
2324
#include "llvm/IR/Constants.h"
@@ -239,6 +240,36 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
239240
return AsmPrinter::lowerConstant(CV);
240241
}
241242

243+
static void emitVGPRBlockComment(const MachineInstr *MI, MCStreamer &OS) {
244+
// The instruction will only transfer a subset of the registers in the block,
245+
// based on the mask that is stored in m0. We could search for the instruction
246+
// that sets m0, but most of the time we'll already have the mask stored in
247+
// the machine function info. Try to use that. This assumes that we only use
248+
// block loads/stores for CSR spills.
249+
const MachineFunction *MF = MI->getParent()->getParent();
250+
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
251+
const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
252+
const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
253+
254+
Register RegBlock =
255+
TII->getNamedOperand(*MI, MI->mayLoad() ? AMDGPU::OpName::vdst
256+
: AMDGPU::OpName::vdata)
257+
->getReg();
258+
Register FirstRegInBlock = TRI.getSubReg(RegBlock, AMDGPU::sub0);
259+
uint32_t Mask = MFI->getMaskForVGPRBlockOps(RegBlock);
260+
261+
SmallString<512> TransferredRegs;
262+
for (unsigned I = 0; I < 32; ++I) {
263+
if (Mask & (1 << I)) {
264+
(llvm::Twine(" ") + TRI.getName(FirstRegInBlock + I))
265+
.toVector(TransferredRegs);
266+
}
267+
}
268+
269+
if (!TransferredRegs.empty())
270+
OS.emitRawComment(" transferring at most " + TransferredRegs);
271+
}
272+
242273
void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
243274
// FIXME: Enable feature predicate checks once all the test pass.
244275
// AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(),
@@ -327,6 +358,10 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
327358
return;
328359
}
329360

361+
if (STI.getInstrInfo()->isBlockLoadStore(MI->getOpcode()))
362+
if (isVerbose())
363+
emitVGPRBlockComment(MI, *OutStreamer);
364+
330365
MCInst TmpInst;
331366
MCInstLowering.lower(MI, TmpInst);
332367
EmitToStreamer(*OutStreamer, TmpInst);

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
254254
bool HasMinimum3Maximum3PKF16 = false;
255255

256256
bool RequiresCOV6 = false;
257+
bool UseBlockVGPROpsForCSR = false;
257258

258259
// Dummy feature to use for assembler in tablegen.
259260
bool FeatureDisable = false;
@@ -1265,6 +1266,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
12651266

12661267
bool requiresCodeObjectV6() const { return RequiresCOV6; }
12671268

1269+
bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
1270+
12681271
bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
12691272

12701273
bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1694,6 +1694,110 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
16941694
}
16951695
}
16961696

1697+
/// Group callee-saved VGPRs into blocks of up to 32 consecutive registers,
/// assign a single (possibly trimmed) stack slot per block, and record the
/// transfer mask for each block in the machine function info. CSI entries
/// absorbed into a block are removed; the surviving entry is rewritten to
/// refer to the whole block register and marked as handled by the target.
static void assignSlotsUsingVGPRBlocks(MachineFunction &MF,
                                       const GCNSubtarget &ST,
                                       const TargetRegisterInfo *TRI,
                                       std::vector<CalleeSavedInfo> &CSI,
                                       unsigned &MinCSFrameIndex,
                                       unsigned &MaxCSFrameIndex) {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  // NB: this is the SIRegisterInfo, not the MachineRegisterInfo.
  const SIRegisterInfo *SRI = ST.getRegisterInfo();

  // The grouping below relies on scanning the CSRs in increasing register
  // order, so that each block's base register is seen first.
  assert(std::is_sorted(CSI.begin(), CSI.end(),
                        [](const CalleeSavedInfo &A, const CalleeSavedInfo &B) {
                          return A.getReg() < B.getReg();
                        }) &&
         "Callee saved registers not sorted");

  // Only VGPRs that are spilled to the stack (not to another register) and
  // are not WWM-reserved may be covered by a block transfer.
  auto CanUseBlockOps = [&](const CalleeSavedInfo &CSI) {
    return !CSI.isSpilledToReg() &&
           SRI->isVGPR(MF.getRegInfo(), CSI.getReg()) &&
           !FuncInfo->isWWMReservedRegister(CSI.getReg());
  };

  auto CSEnd = CSI.end();
  for (auto CSIt = CSI.begin(); CSIt != CSEnd; ++CSIt) {
    Register Reg = CSIt->getReg();
    if (!CanUseBlockOps(*CSIt))
      continue;

    // Find all the regs that will fit in a block of 32 registers starting at
    // the current reg and build the mask. It should have 1 for every register
    // that's included, with the current register as the least significant bit.
    uint32_t Mask = 1;
    CSEnd = std::remove_if(
        CSIt + 1, CSEnd, [&](const CalleeSavedInfo &CSI) -> bool {
          if (CanUseBlockOps(CSI) && CSI.getReg() < Reg + 32) {
            // Unsigned literal: shifting a signed 1 into bit 31 is UB.
            Mask |= 1u << (CSI.getReg() - Reg);
            return true;
          }
          return false;
        });

    const TargetRegisterClass *BlockRegClass =
        TII->getRegClassForBlockOp(TRI, MF);
    Register RegBlock =
        SRI->getMatchingSuperReg(Reg, AMDGPU::sub0, BlockRegClass);
    if (!RegBlock) {
      // We couldn't find a super register for the block. This can happen if
      // the register we started with is too high (e.g. v232 if the maximum is
      // v255). We therefore try to get the last register block and figure out
      // the mask from there.
      Register LastBlockStart =
          AMDGPU::VGPR0 + alignDown(Reg - AMDGPU::VGPR0, 32);
      RegBlock =
          SRI->getMatchingSuperReg(LastBlockStart, AMDGPU::sub0, BlockRegClass);
      assert(RegBlock && SRI->isSubRegister(RegBlock, Reg) &&
             "Couldn't find super register");
      int RegDelta = Reg - LastBlockStart;
      assert(RegDelta > 0 && llvm::countl_zero(Mask) >= RegDelta &&
             "Bad shift amount");
      // Re-anchor the mask on the block's actual first register.
      Mask <<= RegDelta;
    }

    FuncInfo->setMaskForVGPRBlockOps(RegBlock, Mask);

    // The stack objects can be a bit smaller than the register block if we know
    // some of the high bits of Mask are 0. This may happen often with calling
    // conventions where the caller and callee-saved VGPRs are interleaved at
    // a small boundary (e.g. 8 or 16).
    int UnusedBits = llvm::countl_zero(Mask);
    unsigned BlockSize = SRI->getSpillSize(*BlockRegClass) - UnusedBits * 4;
    int FrameIdx =
        MFI.CreateStackObject(BlockSize, SRI->getSpillAlign(*BlockRegClass),
                              /*isSpillSlot=*/true);
    if ((unsigned)FrameIdx < MinCSFrameIndex)
      MinCSFrameIndex = FrameIdx;
    if ((unsigned)FrameIdx > MaxCSFrameIndex)
      MaxCSFrameIndex = FrameIdx;

    // Rewrite this entry to cover the whole block; generic PEI code will now
    // skip it (isHandledByTarget), and the absorbed entries are erased below.
    CSIt->setFrameIdx(FrameIdx);
    CSIt->setReg(RegBlock);
    CSIt->setHandledByTarget();
  }
  CSI.erase(CSEnd, CSI.end());
}
1783+
1784+
bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
    unsigned &MaxCSFrameIndex) const {
  // Nothing to do when no callee-saved registers are modified.
  if (CSI.empty())
    return true;

  // When the subtarget supports it, let block loads/stores claim as many
  // callee-saved VGPRs as possible before falling back to the generic path.
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.useVGPRBlockOpsForCSR())
    assignSlotsUsingVGPRBlocks(MF, ST, TRI, CSI, MinCSFrameIndex,
                               MaxCSFrameIndex);

  // The three-argument overload handles whatever remains in CSI.
  return assignCalleeSavedSpillSlots(MF, TRI, CSI);
}
1800+
16971801
bool SIFrameLowering::assignCalleeSavedSpillSlots(
16981802
MachineFunction &MF, const TargetRegisterInfo *TRI,
16991803
std::vector<CalleeSavedInfo> &CSI) const {
@@ -1763,6 +1867,101 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
17631867
return true;
17641868
}
17651869

1870+
/// Emit block stores for the callee-saved register blocks marked as handled
/// by the target. Returns false so the generic PEI code still spills the
/// remaining (non-handled) CSRs individually.
bool SIFrameLowering::spillCalleeSavedRegisters(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  if (!ST.useVGPRBlockOpsForCSR())
    return false;

  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  // Single function-info pointer (was previously fetched twice under two
  // different names, MFI and FuncInfo).
  SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  for (const CalleeSavedInfo &CS : CSI) {
    Register Reg = CS.getReg();
    if (!CS.isHandledByTarget())
      continue;

    // Build a scratch block store.
    uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(Reg);
    int FrameIndex = CS.getFrameIdx();
    MachinePointerInfo PtrInfo =
        MachinePointerInfo::getFixedStack(*MF, FrameIndex);
    MachineMemOperand *MMO =
        MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                 FrameInfo.getObjectSize(FrameIndex),
                                 FrameInfo.getObjectAlign(FrameIndex));

    BuildMI(MBB, MI, MI->getDebugLoc(),
            TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_SAVE))
        .addReg(Reg, getKillRegState(false))
        .addFrameIndex(FrameIndex)
        .addReg(FuncInfo->getStackPtrOffsetReg())
        .addImm(0)
        .addImm(Mask)
        .addMemOperand(MMO);

    FuncInfo->setHasSpilledVGPRs();

    // Add the register to the liveins. This is necessary because if any of the
    // VGPRs in the register block is reserved (e.g. if it's a WWM register),
    // then the whole block will be marked as reserved and `updateLiveness` will
    // skip it. Guard against duplicates, matching restoreCalleeSavedRegisters.
    if (!MBB.isLiveIn(Reg))
      MBB.addLiveIn(Reg);
  }

  return false;
}
1918+
1919+
/// Emit block loads for the callee-saved register blocks marked as handled by
/// the target. Returns false so the generic PEI code still restores the
/// remaining (non-handled) CSRs individually.
bool SIFrameLowering::restoreCalleeSavedRegisters(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  if (!ST.useVGPRBlockOpsForCSR())
    return false;

  SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &MFI = MF->getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *SITRI = static_cast<const SIRegisterInfo *>(TRI);

  // Restores are emitted in the reverse order of the saves.
  for (const CalleeSavedInfo &CS : reverse(CSI)) {
    if (!CS.isHandledByTarget())
      continue;

    // Build a scratch block load.
    Register BlockReg = CS.getReg();
    uint32_t TransferMask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
    int SlotIdx = CS.getFrameIdx();
    MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, SlotIdx);
    MachineMemOperand *LoadMMO = MF->getMachineMemOperand(
        PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(SlotIdx),
        MFI.getObjectAlign(SlotIdx));

    auto Restore = BuildMI(MBB, MI, MI->getDebugLoc(),
                           TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE),
                           BlockReg)
                       .addFrameIndex(SlotIdx)
                       .addReg(FuncInfo->getStackPtrOffsetReg())
                       .addImm(0)
                       .addImm(TransferMask)
                       .addMemOperand(LoadMMO);
    // Mark the non-transferred registers as implicit uses so LiveRegUnits
    // knows they are unavailable before the restore.
    SITRI->addImplicitUsesForBlockCSRLoad(Restore, BlockReg);

    // Add the register to the liveins. This is necessary because if any of the
    // VGPRs in the register block is reserved (e.g. if it's a WWM register),
    // then the whole block will be marked as reserved and `updateLiveness` will
    // skip it.
    if (!MBB.isLiveIn(BlockReg))
      MBB.addLiveIn(BlockReg);
  }

  return false;
}
1964+
17661965
MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
17671966
MachineFunction &MF,
17681967
MachineBasicBlock &MBB,

llvm/lib/Target/AMDGPU/SIFrameLowering.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,23 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
4949
const TargetRegisterInfo *TRI,
5050
std::vector<CalleeSavedInfo> &CSI) const override;
5151

52+
bool assignCalleeSavedSpillSlots(MachineFunction &MF,
53+
const TargetRegisterInfo *TRI,
54+
std::vector<CalleeSavedInfo> &CSI,
55+
unsigned &MinCSFrameIndex,
56+
unsigned &MaxCSFrameIndex) const override;
57+
58+
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
59+
MachineBasicBlock::iterator MI,
60+
ArrayRef<CalleeSavedInfo> CSI,
61+
const TargetRegisterInfo *TRI) const override;
62+
63+
bool
64+
restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
65+
MachineBasicBlock::iterator MI,
66+
MutableArrayRef<CalleeSavedInfo> CSI,
67+
const TargetRegisterInfo *TRI) const override;
68+
5269
bool allocateScavengingFrameIndexesNearIncomingSP(
5370
const MachineFunction &MF) const override;
5471

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5831,6 +5831,16 @@ const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
58315831
IsAllocatable);
58325832
}
58335833

5834+
const TargetRegisterClass *
SIInstrInfo::getRegClassForBlockOp(const TargetRegisterInfo *TRI,
                                   const MachineFunction &MF) const {
  // The register class of SCRATCH_STORE_BLOCK_SADDR's vdata operand is the
  // class used for all block load/store operations.
  const MCInstrDesc &StoreDesc = get(AMDGPU::SCRATCH_STORE_BLOCK_SADDR);
  int DataIdx =
      AMDGPU::getNamedOperandIdx(StoreDesc.getOpcode(), AMDGPU::OpName::vdata);
  return getRegClass(StoreDesc, DataIdx, TRI, MF);
}
5843+
58345844
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
58355845
unsigned OpNo) const {
58365846
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

0 commit comments

Comments
 (0)