-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] Implement CFI for non-kernel functions #164723
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/slinder1/amdgpu-cfi-3
Are you sure you want to change the base?
Conversation
|
Warning: This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite. Learn more about stacking. |
|
@llvm/pr-subscribers-debuginfo @llvm/pr-subscribers-backend-amdgpu Author: Scott Linder (slinder1)

Changes: This does not implement CSR spills other than those AMDGPU handles. Co-authored-by: Scott Linder <[email protected]>

Patch is 3.42 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/164723.diff — 88 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 71356aa2aced1..5a0b1afbdfdff 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -17,6 +17,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/LEB128.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -29,6 +30,10 @@ static cl::opt<bool> EnableSpillVGPRToAGPR(
cl::ReallyHidden,
cl::init(true));
+static constexpr unsigned SGPRBitSize = 32;
+static constexpr unsigned SGPRByteSize = SGPRBitSize / 8;
+static constexpr unsigned VGPRLaneBitSize = 32;
+
// Find a register matching \p RC from \p LiveUnits which is unused and
// available throughout the function. On failure, returns AMDGPU::NoRegister.
// TODO: Rewrite the loop here to iterate over MCRegUnits instead of
@@ -54,6 +59,72 @@ static bool needsFrameMoves(const MachineFunction &MF) {
return true;
}
+static void encodeDwarfRegisterLocation(int DwarfReg, raw_ostream &OS) {
+ assert(DwarfReg >= 0);
+ if (DwarfReg < 32) {
+ OS << uint8_t(dwarf::DW_OP_reg0 + DwarfReg);
+ } else {
+ OS << uint8_t(dwarf::DW_OP_regx);
+ encodeULEB128(DwarfReg, OS);
+ }
+}
+
+static MCCFIInstruction
+createScaledCFAInPrivateWave(const GCNSubtarget &ST,
+ MCRegister DwarfStackPtrReg) {
+ assert(ST.enableFlatScratch());
+
+ // When flat scratch is enabled, the stack pointer is an address in the
+ // private_lane DWARF address space (i.e. swizzled), but in order to
+ // accurately and efficiently describe things like masked spills of vector
+ // registers we want to define the CFA to be an address in the private_wave
+ // DWARF address space (i.e. unswizzled). To achieve this we scale the stack
+ // pointer by the wavefront size, implemented as (SP << wave_size_log2).
+ const unsigned WavefrontSizeLog2 = ST.getWavefrontSizeLog2();
+ assert(WavefrontSizeLog2 < 32);
+
+ SmallString<20> Block;
+ raw_svector_ostream OSBlock(Block);
+ encodeDwarfRegisterLocation(DwarfStackPtrReg, OSBlock);
+ OSBlock << uint8_t(dwarf::DW_OP_deref_size) << uint8_t(SGPRByteSize)
+ << uint8_t(dwarf::DW_OP_lit0 + WavefrontSizeLog2)
+ << uint8_t(dwarf::DW_OP_shl)
+ << uint8_t(dwarf::DW_OP_lit0 +
+ dwarf::DW_ASPACE_LLVM_AMDGPU_private_wave)
+ << uint8_t(dwarf::DW_OP_LLVM_user)
+ << uint8_t(dwarf::DW_OP_LLVM_form_aspace_address);
+
+ SmallString<20> CFIInst;
+ raw_svector_ostream OSCFIInst(CFIInst);
+ OSCFIInst << uint8_t(dwarf::DW_CFA_def_cfa_expression);
+ encodeULEB128(Block.size(), OSCFIInst);
+ OSCFIInst << Block;
+
+ return MCCFIInstruction::createEscape(nullptr, OSCFIInst.str());
+}
+
+void SIFrameLowering::emitDefCFA(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc const &DL, Register StackPtrReg,
+ bool AspaceAlreadyDefined,
+ MachineInstr::MIFlag Flags) const {
+ MachineFunction &MF = *MBB.getParent();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const MCRegisterInfo *MCRI = MF.getContext().getRegisterInfo();
+
+ MCRegister DwarfStackPtrReg = MCRI->getDwarfRegNum(StackPtrReg, false);
+ MCCFIInstruction CFIInst =
+ ST.enableFlatScratch()
+ ? createScaledCFAInPrivateWave(ST, DwarfStackPtrReg)
+ : (AspaceAlreadyDefined
+ ? MCCFIInstruction::createLLVMDefAspaceCfa(
+ nullptr, DwarfStackPtrReg, 0,
+ dwarf::DW_ASPACE_LLVM_AMDGPU_private_wave, SMLoc())
+ : MCCFIInstruction::createDefCfaRegister(nullptr,
+ DwarfStackPtrReg));
+ buildCFI(MBB, MBBI, DL, CFIInst, Flags);
+}
+
// Find a scratch register that we can use in the prologue. We avoid using
// callee-save registers since they may appear to be free when this is called
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
@@ -242,6 +313,8 @@ class PrologEpilogSGPRSpillBuilder {
SIMachineFunctionInfo *FuncInfo;
const SIInstrInfo *TII;
const SIRegisterInfo &TRI;
+ const MCRegisterInfo *MCRI;
+ const SIFrameLowering *TFI;
Register SuperReg;
const PrologEpilogSGPRSaveRestoreInfo SI;
LiveRegUnits &LiveUnits;
@@ -250,9 +323,16 @@ class PrologEpilogSGPRSpillBuilder {
ArrayRef<int16_t> SplitParts;
unsigned NumSubRegs;
unsigned EltSize = 4;
+ bool IsFramePtrPrologSpill;
+ bool NeedsFrameMoves;
+
+ bool isExec(Register Reg) const {
+ return Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::EXEC;
+ }
void saveToMemory(const int FI) const {
MachineRegisterInfo &MRI = MF.getRegInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
assert(!MFI.isDeadObjectIndex(FI));
initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);
@@ -271,6 +351,20 @@ class PrologEpilogSGPRSpillBuilder {
buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR,
FI, FrameReg, DwordOff);
+ if (NeedsFrameMoves) {
+ if (isExec(SuperReg) && (I == NumSubRegs - 1))
+ SubReg = AMDGPU::EXEC;
+ else if (IsFramePtrPrologSpill)
+ SubReg = FuncInfo->getFrameOffsetReg();
+
+ // FIXME: CFI for EXEC needs a fix by accurately computing the spill
+ // offset for both the low and high components.
+ if (SubReg != AMDGPU::EXEC_LO)
+ TFI->buildCFI(MBB, MI, DL,
+ MCCFIInstruction::createOffset(
+ nullptr, MCRI->getDwarfRegNum(SubReg, false),
+ MFI.getObjectOffset(FI) * ST.getWavefrontSize()));
+ }
DwordOff += 4;
}
}
@@ -292,6 +386,19 @@ class PrologEpilogSGPRSpillBuilder {
.addReg(SubReg)
.addImm(Spill[I].Lane)
.addReg(Spill[I].VGPR, RegState::Undef);
+ if (NeedsFrameMoves) {
+ if (isExec(SuperReg)) {
+ if (I == NumSubRegs - 1)
+ TFI->buildCFIForSGPRToVGPRSpill(MBB, MI, DL, AMDGPU::EXEC, Spill);
+ } else if (IsFramePtrPrologSpill) {
+ TFI->buildCFIForSGPRToVGPRSpill(MBB, MI, DL,
+ FuncInfo->getFrameOffsetReg(),
+ Spill[I].VGPR, Spill[I].Lane);
+ } else {
+ TFI->buildCFIForSGPRToVGPRSpill(MBB, MI, DL, SubReg, Spill[I].VGPR,
+ Spill[I].Lane);
+ }
+ }
}
}
@@ -299,10 +406,35 @@ class PrologEpilogSGPRSpillBuilder {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
.addReg(SuperReg)
.setMIFlag(MachineInstr::FrameSetup);
+ if (NeedsFrameMoves) {
+ const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(DstReg);
+ ArrayRef<int16_t> DstSplitParts = TRI.getRegSplitParts(RC, EltSize);
+ unsigned DstNumSubRegs = DstSplitParts.empty() ? 1 : DstSplitParts.size();
+ assert(NumSubRegs == DstNumSubRegs);
+ for (unsigned I = 0; I < NumSubRegs; ++I) {
+ Register SrcSubReg =
+ NumSubRegs == 1 ? SuperReg
+ : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
+ Register DstSubReg =
+ NumSubRegs == 1 ? DstReg
+ : Register(TRI.getSubReg(DstReg, DstSplitParts[I]));
+ if (isExec(SuperReg)) {
+ if (I == NumSubRegs - 1)
+ TFI->buildCFIForRegToSGPRPairSpill(MBB, MI, DL, AMDGPU::EXEC,
+ DstReg);
+ } else {
+ TFI->buildCFI(MBB, MI, DL,
+ MCCFIInstruction::createRegister(
+ nullptr, MCRI->getDwarfRegNum(SrcSubReg, false),
+ MCRI->getDwarfRegNum(DstSubReg, false)));
+ }
+ }
+ }
}
void restoreFromMemory(const int FI) {
MachineRegisterInfo &MRI = MF.getRegInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
@@ -354,12 +486,15 @@ class PrologEpilogSGPRSpillBuilder {
MachineBasicBlock::iterator MI,
const DebugLoc &DL, const SIInstrInfo *TII,
const SIRegisterInfo &TRI,
- LiveRegUnits &LiveUnits, Register FrameReg)
+ LiveRegUnits &LiveUnits, Register FrameReg,
+ bool IsFramePtrPrologSpill = false)
: MI(MI), MBB(MBB), MF(*MBB.getParent()),
ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
- SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL),
- FrameReg(FrameReg) {
+ MCRI(MF.getContext().getRegisterInfo()), TFI(ST.getFrameLowering()),
+ SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL), FrameReg(FrameReg),
+ IsFramePtrPrologSpill(IsFramePtrPrologSpill),
+ NeedsFrameMoves(needsFrameMoves(MF)) {
const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
SplitParts = TRI.getRegSplitParts(RC, EltSize);
NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
@@ -967,6 +1102,50 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
llvm_unreachable("Invalid TargetStackID::Value");
}
+void SIFrameLowering::emitPrologueEntryCFI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL) const {
+ const MachineFunction &MF = *MBB.getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const MCRegisterInfo *MCRI = MF.getContext().getRegisterInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
+ Register StackPtrReg =
+ MF.getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg();
+
+ emitDefCFA(MBB, MBBI, DL, StackPtrReg, /*AspaceAlreadyDefined=*/true,
+ MachineInstr::FrameSetup);
+
+ buildCFIForRegToSGPRPairSpill(MBB, MBBI, DL, AMDGPU::PC_REG,
+ TRI.getReturnAddressReg(MF));
+
+ BitVector IsCalleeSaved(TRI.getNumRegs());
+ const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+ for (unsigned I = 0; CSRegs[I]; ++I) {
+ IsCalleeSaved.set(CSRegs[I]);
+ }
+ auto ProcessReg = [&](MCPhysReg Reg) {
+ if (IsCalleeSaved.test(Reg) || !MRI.isPhysRegModified(Reg))
+ return;
+ MCRegister DwarfReg = MCRI->getDwarfRegNum(Reg, false);
+ buildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createUndefined(nullptr, DwarfReg));
+ };
+
+ // Emit CFI rules for caller saved Arch VGPRs which are clobbered
+ unsigned NumArchVGPRs = ST.has1024AddressableVGPRs() ? 1024 : 256;
+ for_each(AMDGPU::VGPR_32RegClass.getRegisters().take_front(NumArchVGPRs),
+ ProcessReg);
+
+ // Emit CFI rules for caller saved Accum VGPRs which are clobbered
+ if (ST.hasMAIInsts()) {
+ for_each(AMDGPU::AGPR_32RegClass.getRegisters(), ProcessReg);
+ }
+
+ // Emit CFI rules for caller saved SGPRs which are clobbered
+ for_each(AMDGPU::SGPR_32RegClass.getRegisters(), ProcessReg);
+}
+
// Activate only the inactive lanes when \p EnableInactiveLanes is true.
// Otherwise, activate all lanes. It returns the saved exec.
static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
@@ -1013,14 +1192,19 @@ static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
return ScratchExecCopy;
}
-void SIFrameLowering::emitCSRSpillStores(
- MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
- Register FrameReg, Register FramePtrRegScratchCopy) const {
+void SIFrameLowering::emitCSRSpillStores(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc &DL, LiveRegUnits &LiveUnits,
+ Register FrameReg,
+ Register FramePtrRegScratchCopy,
+ const bool NeedsFrameMoves) const {
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ const MCRegisterInfo *MCRI = MF.getContext().getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
@@ -1042,6 +1226,12 @@ void SIFrameLowering::emitCSRSpillStores(
int FI = Reg.second;
buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
VGPR, FI, FrameReg);
+ if (NeedsFrameMoves)
+ // We spill the entire VGPR, so we can get away with just cfi_offset
+ buildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createOffset(
+ nullptr, MCRI->getDwarfRegNum(VGPR, false),
+ MFI.getObjectOffset(FI) * ST.getWavefrontSize()));
}
};
@@ -1090,13 +1280,13 @@ void SIFrameLowering::emitCSRSpillStores(
// Skip if FP is saved to a scratch SGPR, the save has already been emitted.
// Otherwise, FP has been moved to a temporary register and spill it
// instead.
- Register Reg =
- Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
+ bool IsFramePtrPrologSpill = Spill.first == FramePtrReg ? true : false;
+ Register Reg = IsFramePtrPrologSpill ? FramePtrRegScratchCopy : Spill.first;
if (!Reg)
continue;
PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
- LiveUnits, FrameReg);
+ LiveUnits, FrameReg, IsFramePtrPrologSpill);
SB.save();
}
@@ -1264,6 +1454,11 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
uint32_t NumBytes = MFI.getStackSize();
uint32_t RoundedSize = NumBytes;
+ const bool NeedsFrameMoves = needsFrameMoves(MF);
+
+ if (NeedsFrameMoves)
+ emitPrologueEntryCFI(MBB, MBBI, DL);
+
if (TRI.hasStackRealignment(MF))
HasFP = true;
@@ -1272,7 +1467,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
// Emit the CSR spill stores with SP base register.
emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
FuncInfo->isChainFunction() ? Register() : StackPtrReg,
- FramePtrRegScratchCopy);
+ FramePtrRegScratchCopy, NeedsFrameMoves);
} else {
// CSR spill stores will use FP as base register.
Register SGPRForFPSaveRestoreCopy =
@@ -1286,7 +1481,8 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
PrologEpilogSGPRSpillBuilder SB(
FramePtrReg,
FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
- DL, TII, TRI, LiveUnits, FramePtrReg);
+ DL, TII, TRI, LiveUnits, FramePtrReg,
+ /*IsFramePtrPrologSpill*/ true);
SB.save();
LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
} else {
@@ -1333,7 +1529,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
// If FP is used, emit the CSR spills with FP base register.
if (HasFP) {
emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
- FramePtrRegScratchCopy);
+ FramePtrRegScratchCopy, NeedsFrameMoves);
if (FramePtrRegScratchCopy)
LiveUnits.removeReg(FramePtrRegScratchCopy);
}
@@ -1348,6 +1544,12 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
+ if (HasFP) {
+ if (NeedsFrameMoves)
+ emitDefCFA(MBB, MBBI, DL, FramePtrReg, /*AspaceAlreadyDefined=*/false,
+ MachineInstr::FrameSetup);
+ }
+
if (HasFP && RoundedSize != 0) {
auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
.addReg(StackPtrReg)
@@ -1447,6 +1649,13 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
FramePtrRegScratchCopy);
}
+ const bool NeedsFrameMoves = needsFrameMoves(MF);
+ if (hasFP(MF)) {
+ if (NeedsFrameMoves)
+ emitDefCFA(MBB, MBBI, DL, StackPtrReg, /*AspaceAlreadyDefined=*/false,
+ MachineInstr::FrameDestroy);
+ }
+
if (FPSaved) {
// Insert the copy to restore FP.
Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
@@ -2257,3 +2466,72 @@ MachineInstr *SIFrameLowering::buildCFI(MachineBasicBlock &MBB,
.addCFIIndex(MF.addFrameInst(CFIInst))
.setMIFlag(flag);
}
+
+MachineInstr *SIFrameLowering::buildCFIForSGPRToVGPRSpill(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, const Register SGPR, const Register VGPR,
+ const int Lane) const {
+ const MachineFunction &MF = *MBB.getParent();
+ const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
+
+ int DwarfSGPR = MCRI.getDwarfRegNum(SGPR, false);
+ int DwarfVGPR = MCRI.getDwarfRegNum(VGPR, false);
+ assert(DwarfSGPR != -1 && DwarfVGPR != -1);
+ assert(Lane != -1 && "Expected a lane to be present");
+
+ // Build a CFI instruction that represents a SGPR spilled to a single lane of
+ // a VGPR.
+ MCCFIInstruction::VectorRegisterWithLane VR{unsigned(DwarfVGPR),
+ unsigned(Lane), VGPRLaneBitSize};
+ auto CFIInst =
+ MCCFIInstruction::createLLVMVectorRegisters(nullptr, DwarfSGPR, {VR});
+ return buildCFI(MBB, MBBI, DL, std::move(CFIInst));
+}
+
+MachineInstr *SIFrameLowering::buildCFIForSGPRToVGPRSpill(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, Register SGPR,
+ ArrayRef<SIRegisterInfo::SpilledReg> VGPRSpills) const {
+ const MachineFunction &MF = *MBB.getParent();
+ const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
+
+ int DwarfSGPR = MCRI.getDwarfRegNum(SGPR, false);
+ assert(DwarfSGPR != -1);
+
+ // Build a CFI instruction that represents a SGPR spilled to multiple lanes of
+ // multiple VGPRs.
+
+ std::vector<MCCFIInstruction::VectorRegisterWithLane> VGPRs;
+ for (SIRegisterInfo::SpilledReg Spill : VGPRSpills) {
+ int DwarfVGPR = MCRI.getDwarfRegNum(Spill.VGPR, false);
+ assert(DwarfVGPR != -1);
+ assert(Spill.hasLane() && "Expected a lane to be present");
+ VGPRs.push_back(
+ {unsigned(DwarfVGPR), unsigned(Spill.Lane), VGPRLaneBitSize});
+ }
+
+ auto CFIInst = MCCFIInstruction::createLLVMVectorRegisters(nullptr, DwarfSGPR,
+ std::move(VGPRs));
+ return buildCFI(MBB, MBBI, DL, std::move(CFIInst));
+}
+
+MachineInstr *SIFrameLowering::buildCFIForRegToSGPRPairSpill(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, const Register Reg, const Register SGPRPair) const {
+ const MachineFunction &MF = *MBB.getParent();
+ const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
+
+ int SGPR0 = TRI.getSubReg(SGPRPair, AMDGPU::sub0);
+ int SGPR1 = TRI.getSubReg(SGPRPair, AMDGPU::sub1);
+
+ int DwarfReg = MCRI.getDwarfRegNum(Reg, false);
+ int DwarfSGPR0 = MCRI.getDwarfRegNum(SGPR0, false);
+ int DwarfSGPR1 = MCRI.getDwarfRegNum(SGPR1, false);
+ assert(DwarfReg != -1 && DwarfSGPR0 != 1 && DwarfSGPR1 != 1);
+
+ auto CFIInst = MCCFIInstruction::createLLVMRegisterPair(
+ nullptr, DwarfReg, DwarfSGPR0, SGPRBitSize, DwarfSGPR1, SGPRBitSize);
+ return buildCFI(MBB, MBBI, DL, std::move(CFIInst));
+}
diff --git a/llvm/lib/T...
[truncated]
|
|
@llvm/pr-subscribers-llvm-globalisel Author: Scott Linder (slinder1)

Changes: This does not implement CSR spills other than those AMDGPU handles. Co-authored-by: Scott Linder <[email protected]>

Patch is 3.42 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/164723.diff — 88 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 71356aa2aced1..5a0b1afbdfdff 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -17,6 +17,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/LEB128.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -29,6 +30,10 @@ static cl::opt<bool> EnableSpillVGPRToAGPR(
cl::ReallyHidden,
cl::init(true));
+static constexpr unsigned SGPRBitSize = 32;
+static constexpr unsigned SGPRByteSize = SGPRBitSize / 8;
+static constexpr unsigned VGPRLaneBitSize = 32;
+
// Find a register matching \p RC from \p LiveUnits which is unused and
// available throughout the function. On failure, returns AMDGPU::NoRegister.
// TODO: Rewrite the loop here to iterate over MCRegUnits instead of
@@ -54,6 +59,72 @@ static bool needsFrameMoves(const MachineFunction &MF) {
return true;
}
+static void encodeDwarfRegisterLocation(int DwarfReg, raw_ostream &OS) {
+ assert(DwarfReg >= 0);
+ if (DwarfReg < 32) {
+ OS << uint8_t(dwarf::DW_OP_reg0 + DwarfReg);
+ } else {
+ OS << uint8_t(dwarf::DW_OP_regx);
+ encodeULEB128(DwarfReg, OS);
+ }
+}
+
+static MCCFIInstruction
+createScaledCFAInPrivateWave(const GCNSubtarget &ST,
+ MCRegister DwarfStackPtrReg) {
+ assert(ST.enableFlatScratch());
+
+ // When flat scratch is enabled, the stack pointer is an address in the
+ // private_lane DWARF address space (i.e. swizzled), but in order to
+ // accurately and efficiently describe things like masked spills of vector
+ // registers we want to define the CFA to be an address in the private_wave
+ // DWARF address space (i.e. unswizzled). To achieve this we scale the stack
+ // pointer by the wavefront size, implemented as (SP << wave_size_log2).
+ const unsigned WavefrontSizeLog2 = ST.getWavefrontSizeLog2();
+ assert(WavefrontSizeLog2 < 32);
+
+ SmallString<20> Block;
+ raw_svector_ostream OSBlock(Block);
+ encodeDwarfRegisterLocation(DwarfStackPtrReg, OSBlock);
+ OSBlock << uint8_t(dwarf::DW_OP_deref_size) << uint8_t(SGPRByteSize)
+ << uint8_t(dwarf::DW_OP_lit0 + WavefrontSizeLog2)
+ << uint8_t(dwarf::DW_OP_shl)
+ << uint8_t(dwarf::DW_OP_lit0 +
+ dwarf::DW_ASPACE_LLVM_AMDGPU_private_wave)
+ << uint8_t(dwarf::DW_OP_LLVM_user)
+ << uint8_t(dwarf::DW_OP_LLVM_form_aspace_address);
+
+ SmallString<20> CFIInst;
+ raw_svector_ostream OSCFIInst(CFIInst);
+ OSCFIInst << uint8_t(dwarf::DW_CFA_def_cfa_expression);
+ encodeULEB128(Block.size(), OSCFIInst);
+ OSCFIInst << Block;
+
+ return MCCFIInstruction::createEscape(nullptr, OSCFIInst.str());
+}
+
+void SIFrameLowering::emitDefCFA(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc const &DL, Register StackPtrReg,
+ bool AspaceAlreadyDefined,
+ MachineInstr::MIFlag Flags) const {
+ MachineFunction &MF = *MBB.getParent();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const MCRegisterInfo *MCRI = MF.getContext().getRegisterInfo();
+
+ MCRegister DwarfStackPtrReg = MCRI->getDwarfRegNum(StackPtrReg, false);
+ MCCFIInstruction CFIInst =
+ ST.enableFlatScratch()
+ ? createScaledCFAInPrivateWave(ST, DwarfStackPtrReg)
+ : (AspaceAlreadyDefined
+ ? MCCFIInstruction::createLLVMDefAspaceCfa(
+ nullptr, DwarfStackPtrReg, 0,
+ dwarf::DW_ASPACE_LLVM_AMDGPU_private_wave, SMLoc())
+ : MCCFIInstruction::createDefCfaRegister(nullptr,
+ DwarfStackPtrReg));
+ buildCFI(MBB, MBBI, DL, CFIInst, Flags);
+}
+
// Find a scratch register that we can use in the prologue. We avoid using
// callee-save registers since they may appear to be free when this is called
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
@@ -242,6 +313,8 @@ class PrologEpilogSGPRSpillBuilder {
SIMachineFunctionInfo *FuncInfo;
const SIInstrInfo *TII;
const SIRegisterInfo &TRI;
+ const MCRegisterInfo *MCRI;
+ const SIFrameLowering *TFI;
Register SuperReg;
const PrologEpilogSGPRSaveRestoreInfo SI;
LiveRegUnits &LiveUnits;
@@ -250,9 +323,16 @@ class PrologEpilogSGPRSpillBuilder {
ArrayRef<int16_t> SplitParts;
unsigned NumSubRegs;
unsigned EltSize = 4;
+ bool IsFramePtrPrologSpill;
+ bool NeedsFrameMoves;
+
+ bool isExec(Register Reg) const {
+ return Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::EXEC;
+ }
void saveToMemory(const int FI) const {
MachineRegisterInfo &MRI = MF.getRegInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
assert(!MFI.isDeadObjectIndex(FI));
initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);
@@ -271,6 +351,20 @@ class PrologEpilogSGPRSpillBuilder {
buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR,
FI, FrameReg, DwordOff);
+ if (NeedsFrameMoves) {
+ if (isExec(SuperReg) && (I == NumSubRegs - 1))
+ SubReg = AMDGPU::EXEC;
+ else if (IsFramePtrPrologSpill)
+ SubReg = FuncInfo->getFrameOffsetReg();
+
+ // FIXME: CFI for EXEC needs a fix by accurately computing the spill
+ // offset for both the low and high components.
+ if (SubReg != AMDGPU::EXEC_LO)
+ TFI->buildCFI(MBB, MI, DL,
+ MCCFIInstruction::createOffset(
+ nullptr, MCRI->getDwarfRegNum(SubReg, false),
+ MFI.getObjectOffset(FI) * ST.getWavefrontSize()));
+ }
DwordOff += 4;
}
}
@@ -292,6 +386,19 @@ class PrologEpilogSGPRSpillBuilder {
.addReg(SubReg)
.addImm(Spill[I].Lane)
.addReg(Spill[I].VGPR, RegState::Undef);
+ if (NeedsFrameMoves) {
+ if (isExec(SuperReg)) {
+ if (I == NumSubRegs - 1)
+ TFI->buildCFIForSGPRToVGPRSpill(MBB, MI, DL, AMDGPU::EXEC, Spill);
+ } else if (IsFramePtrPrologSpill) {
+ TFI->buildCFIForSGPRToVGPRSpill(MBB, MI, DL,
+ FuncInfo->getFrameOffsetReg(),
+ Spill[I].VGPR, Spill[I].Lane);
+ } else {
+ TFI->buildCFIForSGPRToVGPRSpill(MBB, MI, DL, SubReg, Spill[I].VGPR,
+ Spill[I].Lane);
+ }
+ }
}
}
@@ -299,10 +406,35 @@ class PrologEpilogSGPRSpillBuilder {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
.addReg(SuperReg)
.setMIFlag(MachineInstr::FrameSetup);
+ if (NeedsFrameMoves) {
+ const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(DstReg);
+ ArrayRef<int16_t> DstSplitParts = TRI.getRegSplitParts(RC, EltSize);
+ unsigned DstNumSubRegs = DstSplitParts.empty() ? 1 : DstSplitParts.size();
+ assert(NumSubRegs == DstNumSubRegs);
+ for (unsigned I = 0; I < NumSubRegs; ++I) {
+ Register SrcSubReg =
+ NumSubRegs == 1 ? SuperReg
+ : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
+ Register DstSubReg =
+ NumSubRegs == 1 ? DstReg
+ : Register(TRI.getSubReg(DstReg, DstSplitParts[I]));
+ if (isExec(SuperReg)) {
+ if (I == NumSubRegs - 1)
+ TFI->buildCFIForRegToSGPRPairSpill(MBB, MI, DL, AMDGPU::EXEC,
+ DstReg);
+ } else {
+ TFI->buildCFI(MBB, MI, DL,
+ MCCFIInstruction::createRegister(
+ nullptr, MCRI->getDwarfRegNum(SrcSubReg, false),
+ MCRI->getDwarfRegNum(DstSubReg, false)));
+ }
+ }
+ }
}
void restoreFromMemory(const int FI) {
MachineRegisterInfo &MRI = MF.getRegInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
@@ -354,12 +486,15 @@ class PrologEpilogSGPRSpillBuilder {
MachineBasicBlock::iterator MI,
const DebugLoc &DL, const SIInstrInfo *TII,
const SIRegisterInfo &TRI,
- LiveRegUnits &LiveUnits, Register FrameReg)
+ LiveRegUnits &LiveUnits, Register FrameReg,
+ bool IsFramePtrPrologSpill = false)
: MI(MI), MBB(MBB), MF(*MBB.getParent()),
ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
- SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL),
- FrameReg(FrameReg) {
+ MCRI(MF.getContext().getRegisterInfo()), TFI(ST.getFrameLowering()),
+ SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL), FrameReg(FrameReg),
+ IsFramePtrPrologSpill(IsFramePtrPrologSpill),
+ NeedsFrameMoves(needsFrameMoves(MF)) {
const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
SplitParts = TRI.getRegSplitParts(RC, EltSize);
NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
@@ -967,6 +1102,50 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
llvm_unreachable("Invalid TargetStackID::Value");
}
+void SIFrameLowering::emitPrologueEntryCFI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL) const {
+ const MachineFunction &MF = *MBB.getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const MCRegisterInfo *MCRI = MF.getContext().getRegisterInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
+ Register StackPtrReg =
+ MF.getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg();
+
+ emitDefCFA(MBB, MBBI, DL, StackPtrReg, /*AspaceAlreadyDefined=*/true,
+ MachineInstr::FrameSetup);
+
+ buildCFIForRegToSGPRPairSpill(MBB, MBBI, DL, AMDGPU::PC_REG,
+ TRI.getReturnAddressReg(MF));
+
+ BitVector IsCalleeSaved(TRI.getNumRegs());
+ const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+ for (unsigned I = 0; CSRegs[I]; ++I) {
+ IsCalleeSaved.set(CSRegs[I]);
+ }
+ auto ProcessReg = [&](MCPhysReg Reg) {
+ if (IsCalleeSaved.test(Reg) || !MRI.isPhysRegModified(Reg))
+ return;
+ MCRegister DwarfReg = MCRI->getDwarfRegNum(Reg, false);
+ buildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createUndefined(nullptr, DwarfReg));
+ };
+
+ // Emit CFI rules for caller saved Arch VGPRs which are clobbered
+ unsigned NumArchVGPRs = ST.has1024AddressableVGPRs() ? 1024 : 256;
+ for_each(AMDGPU::VGPR_32RegClass.getRegisters().take_front(NumArchVGPRs),
+ ProcessReg);
+
+ // Emit CFI rules for caller saved Accum VGPRs which are clobbered
+ if (ST.hasMAIInsts()) {
+ for_each(AMDGPU::AGPR_32RegClass.getRegisters(), ProcessReg);
+ }
+
+ // Emit CFI rules for caller saved SGPRs which are clobbered
+ for_each(AMDGPU::SGPR_32RegClass.getRegisters(), ProcessReg);
+}
+
// Activate only the inactive lanes when \p EnableInactiveLanes is true.
// Otherwise, activate all lanes. It returns the saved exec.
static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
@@ -1013,14 +1192,19 @@ static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
return ScratchExecCopy;
}
-void SIFrameLowering::emitCSRSpillStores(
- MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
- Register FrameReg, Register FramePtrRegScratchCopy) const {
+void SIFrameLowering::emitCSRSpillStores(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc &DL, LiveRegUnits &LiveUnits,
+ Register FrameReg,
+ Register FramePtrRegScratchCopy,
+ const bool NeedsFrameMoves) const {
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ const MCRegisterInfo *MCRI = MF.getContext().getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
@@ -1042,6 +1226,12 @@ void SIFrameLowering::emitCSRSpillStores(
int FI = Reg.second;
buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
VGPR, FI, FrameReg);
+ if (NeedsFrameMoves)
+ // We spill the entire VGPR, so we can get away with just cfi_offset
+ buildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createOffset(
+ nullptr, MCRI->getDwarfRegNum(VGPR, false),
+ MFI.getObjectOffset(FI) * ST.getWavefrontSize()));
}
};
@@ -1090,13 +1280,13 @@ void SIFrameLowering::emitCSRSpillStores(
// Skip if FP is saved to a scratch SGPR, the save has already been emitted.
// Otherwise, FP has been moved to a temporary register and spill it
// instead.
- Register Reg =
- Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
+ bool IsFramePtrPrologSpill = Spill.first == FramePtrReg ? true : false;
+ Register Reg = IsFramePtrPrologSpill ? FramePtrRegScratchCopy : Spill.first;
if (!Reg)
continue;
PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
- LiveUnits, FrameReg);
+ LiveUnits, FrameReg, IsFramePtrPrologSpill);
SB.save();
}
@@ -1264,6 +1454,11 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
uint32_t NumBytes = MFI.getStackSize();
uint32_t RoundedSize = NumBytes;
+ const bool NeedsFrameMoves = needsFrameMoves(MF);
+
+ if (NeedsFrameMoves)
+ emitPrologueEntryCFI(MBB, MBBI, DL);
+
if (TRI.hasStackRealignment(MF))
HasFP = true;
@@ -1272,7 +1467,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
// Emit the CSR spill stores with SP base register.
emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
FuncInfo->isChainFunction() ? Register() : StackPtrReg,
- FramePtrRegScratchCopy);
+ FramePtrRegScratchCopy, NeedsFrameMoves);
} else {
// CSR spill stores will use FP as base register.
Register SGPRForFPSaveRestoreCopy =
@@ -1286,7 +1481,8 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
PrologEpilogSGPRSpillBuilder SB(
FramePtrReg,
FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
- DL, TII, TRI, LiveUnits, FramePtrReg);
+ DL, TII, TRI, LiveUnits, FramePtrReg,
+ /*IsFramePtrPrologSpill*/ true);
SB.save();
LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
} else {
@@ -1333,7 +1529,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
// If FP is used, emit the CSR spills with FP base register.
if (HasFP) {
emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
- FramePtrRegScratchCopy);
+ FramePtrRegScratchCopy, NeedsFrameMoves);
if (FramePtrRegScratchCopy)
LiveUnits.removeReg(FramePtrRegScratchCopy);
}
@@ -1348,6 +1544,12 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
+ if (HasFP) {
+ if (NeedsFrameMoves)
+ emitDefCFA(MBB, MBBI, DL, FramePtrReg, /*AspaceAlreadyDefined=*/false,
+ MachineInstr::FrameSetup);
+ }
+
if (HasFP && RoundedSize != 0) {
auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
.addReg(StackPtrReg)
@@ -1447,6 +1649,13 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
FramePtrRegScratchCopy);
}
+ const bool NeedsFrameMoves = needsFrameMoves(MF);
+ if (hasFP(MF)) {
+ if (NeedsFrameMoves)
+ emitDefCFA(MBB, MBBI, DL, StackPtrReg, /*AspaceAlreadyDefined=*/false,
+ MachineInstr::FrameDestroy);
+ }
+
if (FPSaved) {
// Insert the copy to restore FP.
Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
@@ -2257,3 +2466,72 @@ MachineInstr *SIFrameLowering::buildCFI(MachineBasicBlock &MBB,
.addCFIIndex(MF.addFrameInst(CFIInst))
.setMIFlag(flag);
}
+
+// Insert a CFI instruction recording that \p SGPR has been spilled to a
+// single lane (\p Lane) of \p VGPR, so unwinders can recover its value.
+// Returns the inserted CFI pseudo-instruction.
+MachineInstr *SIFrameLowering::buildCFIForSGPRToVGPRSpill(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    const DebugLoc &DL, const Register SGPR, const Register VGPR,
+    const int Lane) const {
+  const MachineFunction &MF = *MBB.getParent();
+  const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
+
+  // Map to DWARF register numbers; getDwarfRegNum returns -1 if unmapped.
+  int DwarfSGPR = MCRI.getDwarfRegNum(SGPR, false);
+  int DwarfVGPR = MCRI.getDwarfRegNum(VGPR, false);
+  assert(DwarfSGPR != -1 && DwarfVGPR != -1);
+  assert(Lane != -1 && "Expected a lane to be present");
+
+  // Build a CFI instruction that represents a SGPR spilled to a single lane of
+  // a VGPR.
+  MCCFIInstruction::VectorRegisterWithLane VR{unsigned(DwarfVGPR),
+                                              unsigned(Lane), VGPRLaneBitSize};
+  auto CFIInst =
+      MCCFIInstruction::createLLVMVectorRegisters(nullptr, DwarfSGPR, {VR});
+  return buildCFI(MBB, MBBI, DL, std::move(CFIInst));
+}
+
+// Insert a CFI instruction recording that \p SGPR has been spilled across
+// multiple VGPR lanes (one lane per 32-bit sub-register), described by
+// \p VGPRSpills. Returns the inserted CFI pseudo-instruction.
+MachineInstr *SIFrameLowering::buildCFIForSGPRToVGPRSpill(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    const DebugLoc &DL, Register SGPR,
+    ArrayRef<SIRegisterInfo::SpilledReg> VGPRSpills) const {
+  const MachineFunction &MF = *MBB.getParent();
+  const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
+
+  // Map to a DWARF register number; getDwarfRegNum returns -1 if unmapped.
+  int DwarfSGPR = MCRI.getDwarfRegNum(SGPR, false);
+  assert(DwarfSGPR != -1);
+
+  // Build a CFI instruction that represents a SGPR spilled to multiple lanes of
+  // multiple VGPRs.
+
+  std::vector<MCCFIInstruction::VectorRegisterWithLane> VGPRs;
+  VGPRs.reserve(VGPRSpills.size());
+  for (SIRegisterInfo::SpilledReg Spill : VGPRSpills) {
+    int DwarfVGPR = MCRI.getDwarfRegNum(Spill.VGPR, false);
+    assert(DwarfVGPR != -1);
+    assert(Spill.hasLane() && "Expected a lane to be present");
+    VGPRs.push_back(
+        {unsigned(DwarfVGPR), unsigned(Spill.Lane), VGPRLaneBitSize});
+  }
+
+  auto CFIInst = MCCFIInstruction::createLLVMVectorRegisters(nullptr, DwarfSGPR,
+                                                             std::move(VGPRs));
+  return buildCFI(MBB, MBBI, DL, std::move(CFIInst));
+}
+
+// Insert a CFI instruction recording that \p Reg (a 64-bit value) has been
+// saved into the SGPR pair \p SGPRPair: low half in sub0, high half in sub1.
+// Returns the inserted CFI pseudo-instruction.
+MachineInstr *SIFrameLowering::buildCFIForRegToSGPRPairSpill(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    const DebugLoc &DL, const Register Reg, const Register SGPRPair) const {
+  const MachineFunction &MF = *MBB.getParent();
+  const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
+
+  MCRegister SGPR0 = TRI.getSubReg(SGPRPair, AMDGPU::sub0);
+  MCRegister SGPR1 = TRI.getSubReg(SGPRPair, AMDGPU::sub1);
+
+  // Map to DWARF register numbers; getDwarfRegNum returns -1 if unmapped.
+  int DwarfReg = MCRI.getDwarfRegNum(Reg, false);
+  int DwarfSGPR0 = MCRI.getDwarfRegNum(SGPR0, false);
+  int DwarfSGPR1 = MCRI.getDwarfRegNum(SGPR1, false);
+  // Note: compare all three against -1 (the "no mapping" sentinel); the
+  // original compared the sub-registers against 1, which never fires.
+  assert(DwarfReg != -1 && DwarfSGPR0 != -1 && DwarfSGPR1 != -1);
+
+  auto CFIInst = MCCFIInstruction::createLLVMRegisterPair(
+      nullptr, DwarfReg, DwarfSGPR0, SGPRBitSize, DwarfSGPR1, SGPRBitSize);
+  return buildCFI(MBB, MBBI, DL, std::move(CFIInst));
+}
diff --git a/llvm/lib/T...
[truncated]
|
| @@ -26,6 +26,326 @@ body: | | |||
| ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) | |||
| ; GFX908-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 
| |||
| ; GFX908-NEXT: {{ $}} | |||
| ; GFX908-NEXT: frame-setup CFI_INSTRUCTION llvm_def_aspace_cfa $sgpr32, 0, 6 | |||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Most of these test changes look like adding CFI spam to unrelated tests. Most of these should probably add nounwind instead
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is still a huge diff in MIR tests which don't have IR to add nounwind to. I still haven't found a reasonable way to apply the attribute there. If you have any other suggestions I'm open to them, but I haven't been able to.
cbbe613 to
9bd44a8
Compare
This does not implement CSR spills other than those AMDGPU handles during PEI. The remaining spills are handled in a subsequent patch. Co-authored-by: Scott Linder <[email protected]> Co-authored-by: Venkata Ramanaiah Nalamothu <[email protected]>
ee97ce9 to
e811d05
Compare
| bool IsFramePtrPrologSpill; | ||
| bool NeedsFrameMoves; | ||
|
|
||
| bool isExec(Register Reg) const { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| bool isExec(Register Reg) const { | |
| static bool isExec(Register Reg) { |
| if (isExec(SuperReg) && (I == NumSubRegs - 1)) | ||
| SubReg = AMDGPU::EXEC; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't follow how EXEC would be a subreg? Also this could more directly just check the subtarget's exec / wave size?
| if (NeedsFrameMoves) { | ||
| if (isExec(SuperReg)) { | ||
| if (I == NumSubRegs - 1) | ||
| TFI->buildCFIForSGPRToVGPRSpill(MBB, MI, DL, AMDGPU::EXEC, Spill); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should only be handling exec_lo for wave32?
| void SIFrameLowering::emitCSRSpillStores(MachineFunction &MF, | ||
| MachineBasicBlock &MBB, | ||
| MachineBasicBlock::iterator MBBI, | ||
| DebugLoc &DL, LiveRegUnits &LiveUnits, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| DebugLoc &DL, LiveRegUnits &LiveUnits, | |
| const DebugLoc &DL, LiveRegUnits &LiveUnits, |
| int SGPR0 = TRI.getSubReg(SGPRPair, AMDGPU::sub0); | ||
| int SGPR1 = TRI.getSubReg(SGPRPair, AMDGPU::sub1); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| int SGPR0 = TRI.getSubReg(SGPRPair, AMDGPU::sub0); | |
| int SGPR1 = TRI.getSubReg(SGPRPair, AMDGPU::sub1); | |
| MCRegister SGPR0 = TRI.getSubReg(SGPRPair, AMDGPU::sub0); | |
| MCRegister SGPR1 = TRI.getSubReg(SGPRPair, AMDGPU::sub1); |

This does not implement CSR spills other than those AMDGPU handles
during PEI. The remaining spills are handled in a subsequent patch.
Co-authored-by: Scott Linder [email protected]
Co-authored-by: Venkata Ramanaiah Nalamothu [email protected]