-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[AMDGPU] Support block load/store for CSR #130013
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
1ff7e72
0e931ed
bfe16b2
7d332f2
eba6328
88757de
e16d2f9
7573cb8
f64ec4d
5297f03
e982f21
a215dab
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,7 @@ | |
| #include "AMDGPUMachineFunction.h" | ||
| #include "MCTargetDesc/AMDGPUInstPrinter.h" | ||
| #include "MCTargetDesc/AMDGPUMCTargetDesc.h" | ||
| #include "SIMachineFunctionInfo.h" | ||
| #include "llvm/CodeGen/MachineBasicBlock.h" | ||
| #include "llvm/CodeGen/MachineInstr.h" | ||
| #include "llvm/IR/Constants.h" | ||
|
|
@@ -239,6 +240,36 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) { | |
| return AsmPrinter::lowerConstant(CV); | ||
| } | ||
|
|
||
| /// Emit a raw assembly comment naming the VGPRs that the block load/store | ||
| /// \p MI may transfer. Bit I of the mask recorded for the register block | ||
| /// selects the I-th register counting from the block's sub0. Nothing is | ||
| /// emitted when no bit of the mask is set. | ||
| /// | ||
| /// \param MI Block load/store instruction; its vdst (loads) or vdata (stores) | ||
| /// operand names the register block. | ||
| /// \param OS Streamer that receives the comment. | ||
| static void emitVGPRBlockComment(const MachineInstr *MI, MCStreamer &OS) { | ||
| // The instruction will only transfer a subset of the registers in the block, | ||
| // based on the mask that is stored in m0. We could search for the instruction | ||
| // that sets m0, but most of the time we'll already have the mask stored in | ||
| // the machine function info. Try to use that. This assumes that we only use | ||
| // block loads/stores for CSR spills. | ||
| const MachineFunction *MF = MI->getParent()->getParent(); | ||
| const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); | ||
| const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo(); | ||
| const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo(); | ||
rovka marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| Register RegBlock = | ||
| TII->getNamedOperand(*MI, MI->mayLoad() ? AMDGPU::OpName::vdst | ||
| : AMDGPU::OpName::vdata) | ||
| ->getReg(); | ||
| Register FirstRegInBlock = TRI.getSubReg(RegBlock, AMDGPU::sub0); | ||
| uint32_t Mask = MFI->getMaskForVGPRBlockOps(RegBlock); | ||
|
|
||
| SmallString<512> TransferredRegs; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can avoid the temporary SmallString, emitRawComment uses Twine anyway
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wasn't Twine only meant to be used in function arguments? Or did you mean to emit one comment for each transferred register? |
||
| for (unsigned I = 0; I < 32; ++I) { | ||
rovka marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| // Shift an unsigned one so that testing bit 31 never shifts into the sign | ||
| // bit of a signed int. | ||
| if (Mask & (uint32_t{1} << I)) { | ||
| (llvm::Twine(" ") + TRI.getName(FirstRegInBlock + I)) | ||
rovka marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| .toVector(TransferredRegs); | ||
| } | ||
| } | ||
|
|
||
| if (!TransferredRegs.empty()) | ||
rovka marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| OS.emitRawComment(" transferring at most " + TransferredRegs); | ||
| } | ||
|
|
||
| void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { | ||
| // FIXME: Enable feature predicate checks once all the tests pass. | ||
| // AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(), | ||
|
|
@@ -327,6 +358,10 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { | |
| return; | ||
| } | ||
|
|
||
| if (STI.getInstrInfo()->isBlockLoadStore(MI->getOpcode())) | ||
| if (isVerbose()) | ||
rovka marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| emitVGPRBlockComment(MI, *OutStreamer); | ||
|
|
||
| MCInst TmpInst; | ||
| MCInstLowering.lower(MI, TmpInst); | ||
| EmitToStreamer(*OutStreamer, TmpInst); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1694,6 +1694,109 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, | |
| } | ||
| } | ||
|
|
||
| /// Merge callee-saved VGPRs into 32-register blocks and give each block one | ||
| /// spill slot. | ||
| /// | ||
| /// For every eligible entry of \p CSI (a VGPR that is not spilled to a | ||
| /// register and is not a WWM reserved register) the following eligible | ||
| /// entries within 32 registers are folded into a bit mask and removed from | ||
| /// \p CSI. The surviving entry is rewritten to name the 1024-bit register | ||
| /// block, receives a newly created stack object and is flagged as handled by | ||
| /// the target. The mask is recorded in the SIMachineFunctionInfo. | ||
| /// | ||
| /// \param TRI Not consulted here; the register info is taken from \p ST. | ||
| /// \param MinCSFrameIndex Lowered when a new frame index is smaller. | ||
| /// \param MaxCSFrameIndex Raised when a new frame index is larger. | ||
| static void assignSlotsUsingVGPRBlocks(MachineFunction &MF, | ||
| const GCNSubtarget &ST, | ||
| const TargetRegisterInfo *TRI, | ||
rovka marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| std::vector<CalleeSavedInfo> &CSI, | ||
| unsigned &MinCSFrameIndex, | ||
| unsigned &MaxCSFrameIndex) { | ||
| SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); | ||
| MachineFrameInfo &MFI = MF.getFrameInfo(); | ||
| // Note: despite its name this is the SI register info. | ||
| const SIRegisterInfo *MRI = ST.getRegisterInfo(); | ||
|
|
||
| assert(std::is_sorted(CSI.begin(), CSI.end(), | ||
| [](const CalleeSavedInfo &A, const CalleeSavedInfo &B) { | ||
| return A.getReg() < B.getReg(); | ||
| }) && | ||
| "Callee saved registers not sorted"); | ||
|
|
||
| auto CanUseBlockOps = [&](const CalleeSavedInfo &Info) { | ||
| return !Info.isSpilledToReg() && | ||
| MRI->isVGPR(MF.getRegInfo(), Info.getReg()) && | ||
| !FuncInfo->isWWMReservedRegister(Info.getReg()); | ||
| }; | ||
|
|
||
| auto CSEnd = CSI.end(); | ||
| for (auto CSIt = CSI.begin(); CSIt != CSEnd; ++CSIt) { | ||
| Register Reg = CSIt->getReg(); | ||
| if (!CanUseBlockOps(*CSIt)) | ||
| continue; | ||
|
|
||
| // Find all the regs that will fit in a 32-bit mask starting at the current | ||
| // reg and build said mask. It should have 1 for every register that's | ||
| // included, with the current register as the least significant bit. | ||
| uint32_t Mask = 1; | ||
| CSEnd = std::remove_if( | ||
| CSIt + 1, CSEnd, [&](const CalleeSavedInfo &Other) -> bool { | ||
| if (CanUseBlockOps(Other) && Other.getReg() < Reg + 32) { | ||
| // Unsigned shift: the distance can be as large as 31. | ||
| Mask |= uint32_t{1} << (Other.getReg() - Reg); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems a little odd to build the Mask through the predicate, but I guess it will be deterministic even if for example the predicate function was called multiple times on the same element. |
||
| return true; | ||
| } | ||
| return false; | ||
| }); | ||
|
|
||
| const TargetRegisterClass *BlockRegClass = &AMDGPU::VReg_1024RegClass; | ||
| Register RegBlock = | ||
| MRI->getMatchingSuperReg(Reg, AMDGPU::sub0, BlockRegClass); | ||
| if (!RegBlock) { | ||
| // We couldn't find a super register for the block. This can happen if | ||
| // the register we started with is too high (e.g. v232 if the maximum is | ||
| // v255). We therefore try to get the last register block and figure out | ||
| // the mask from there. | ||
| Register LastBlockStart = | ||
| AMDGPU::VGPR0 + alignDown(Reg - AMDGPU::VGPR0, 32); | ||
| RegBlock = | ||
| MRI->getMatchingSuperReg(LastBlockStart, AMDGPU::sub0, BlockRegClass); | ||
| assert(RegBlock && MRI->isSubRegister(RegBlock, Reg) && | ||
| "Couldn't find super register"); | ||
| int RegDelta = Reg - LastBlockStart; | ||
| assert(RegDelta > 0 && llvm::countl_zero(Mask) >= RegDelta && | ||
| "Bad shift amount"); | ||
| Mask <<= RegDelta; | ||
| } | ||
|
|
||
| FuncInfo->setMaskForVGPRBlockOps(RegBlock, Mask); | ||
|
|
||
| // The stack objects can be a bit smaller than the register block if we know | ||
| // some of the high bits of Mask are 0. This may happen often with calling | ||
| // conventions where the caller and callee-saved VGPRs are interleaved at | ||
| // a small boundary (e.g. 8 or 16). | ||
| int UnusedBits = llvm::countl_zero(Mask); | ||
| unsigned BlockSize = MRI->getSpillSize(*BlockRegClass) - UnusedBits * 4; | ||
| int FrameIdx = | ||
| MFI.CreateStackObject(BlockSize, MRI->getSpillAlign(*BlockRegClass), | ||
| /*isSpillSlot=*/true); | ||
| if (static_cast<unsigned>(FrameIdx) < MinCSFrameIndex) | ||
| MinCSFrameIndex = FrameIdx; | ||
| if (static_cast<unsigned>(FrameIdx) > MaxCSFrameIndex) | ||
| MaxCSFrameIndex = FrameIdx; | ||
|
|
||
| // The current entry now stands for the whole register block. | ||
| CSIt->setFrameIdx(FrameIdx); | ||
| CSIt->setReg(RegBlock); | ||
| CSIt->setHandledByTarget(); | ||
| } | ||
| // Drop the entries that were folded into a block above. | ||
| CSI.erase(CSEnd, CSI.end()); | ||
| } | ||
|
|
||
| /// Assign spill slots to the callee-saved registers in \p CSI. When the | ||
| /// subtarget uses VGPR block operations for CSRs, the entries are first | ||
| /// grouped into register blocks; the three-argument overload then runs on | ||
| /// \p CSI and its result is returned. | ||
| bool SIFrameLowering::assignCalleeSavedSpillSlots( | ||
| MachineFunction &MF, const TargetRegisterInfo *TRI, | ||
| std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex, | ||
| unsigned &MaxCSFrameIndex) const { | ||
| // No callee saved registers are modified, so there is nothing to assign. | ||
| if (CSI.empty()) | ||
| return true; | ||
|
|
||
| const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>(); | ||
| if (Subtarget.useVGPRBlockOpsForCSR()) | ||
| assignSlotsUsingVGPRBlocks(MF, Subtarget, TRI, CSI, MinCSFrameIndex, | ||
| MaxCSFrameIndex); | ||
|
|
||
| return assignCalleeSavedSpillSlots(MF, TRI, CSI); | ||
| } | ||
|
|
||
| bool SIFrameLowering::assignCalleeSavedSpillSlots( | ||
| MachineFunction &MF, const TargetRegisterInfo *TRI, | ||
| std::vector<CalleeSavedInfo> &CSI) const { | ||
|
|
@@ -1763,6 +1866,101 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP( | |
| return true; | ||
| } | ||
|
|
||
| /// Insert a block spill pseudo before \p MI for every entry of \p CSI that | ||
| /// is flagged as handled by the target, and mark the spilled register block | ||
| /// as live-in to \p MBB. Does nothing when the subtarget does not use VGPR | ||
| /// block operations for CSRs. | ||
| /// | ||
| /// \returns Always false. NOTE(review): presumably this lets the caller spill | ||
| /// the entries that were not handled here; confirm against the caller. | ||
| bool SIFrameLowering::spillCalleeSavedRegisters( | ||
| MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, | ||
| ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { | ||
| MachineFunction *MF = MBB.getParent(); | ||
| const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); | ||
| if (!ST.useVGPRBlockOpsForCSR()) | ||
| return false; | ||
|
|
||
| MachineFrameInfo &FrameInfo = MF->getFrameInfo(); | ||
| const SIInstrInfo *TII = ST.getInstrInfo(); | ||
| SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>(); | ||
|
|
||
| for (const CalleeSavedInfo &CS : CSI) { | ||
| // Only entries flagged as handled by the target get a block store. | ||
| if (!CS.isHandledByTarget()) | ||
| continue; | ||
|
|
||
| // Build a scratch block store. | ||
| Register Reg = CS.getReg(); | ||
| uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(Reg); | ||
| int FrameIndex = CS.getFrameIdx(); | ||
| MachinePointerInfo PtrInfo = | ||
| MachinePointerInfo::getFixedStack(*MF, FrameIndex); | ||
| MachineMemOperand *MMO = | ||
| MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, | ||
| FrameInfo.getObjectSize(FrameIndex), | ||
| FrameInfo.getObjectAlign(FrameIndex)); | ||
|
|
||
| BuildMI(MBB, MI, MI->getDebugLoc(), | ||
| TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_SAVE)) | ||
| .addReg(Reg, getKillRegState(false)) | ||
| .addFrameIndex(FrameIndex) | ||
| .addReg(FuncInfo->getStackPtrOffsetReg()) | ||
| .addImm(0) | ||
| .addImm(Mask) | ||
| .addMemOperand(MMO); | ||
|
|
||
| FuncInfo->setHasSpilledVGPRs(); | ||
|
|
||
| // Add the register to the liveins. This is necessary because if any of the | ||
| // VGPRs in the register block is reserved (e.g. if it's a WWM register), | ||
| // then the whole block will be marked as reserved and `updateLiveness` will | ||
| // skip it. Skip blocks that are already live-in so the list does not get | ||
| // duplicate entries. | ||
| if (!MBB.isLiveIn(Reg)) | ||
| MBB.addLiveIn(Reg); | ||
rovka marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| return false; | ||
| } | ||
|
|
||
| /// Insert a block restore pseudo before \p MI for every entry of \p CSI that | ||
| /// is flagged as handled by the target, visiting the entries in reverse | ||
| /// order, and make sure the restored register block is live-in to \p MBB. | ||
| /// Does nothing when the subtarget does not use VGPR block operations for | ||
| /// CSRs. Always returns false. | ||
| bool SIFrameLowering::restoreCalleeSavedRegisters( | ||
| MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, | ||
| MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { | ||
| MachineFunction *Func = MBB.getParent(); | ||
| const GCNSubtarget &Subtarget = Func->getSubtarget<GCNSubtarget>(); | ||
| if (!Subtarget.useVGPRBlockOpsForCSR()) | ||
| return false; | ||
|
|
||
| SIMachineFunctionInfo *SIInfo = Func->getInfo<SIMachineFunctionInfo>(); | ||
| MachineFrameInfo &FrameInfo = Func->getFrameInfo(); | ||
| const SIInstrInfo *InstrInfo = Subtarget.getInstrInfo(); | ||
| const SIRegisterInfo *SIRegInfo = static_cast<const SIRegisterInfo *>(TRI); | ||
| for (const CalleeSavedInfo &Entry : reverse(CSI)) { | ||
| if (!Entry.isHandledByTarget()) | ||
| continue; | ||
|
|
||
| // Describe the stack slot the block is reloaded from. | ||
| Register BlockReg = Entry.getReg(); | ||
| uint32_t BlockMask = SIInfo->getMaskForVGPRBlockOps(BlockReg); | ||
| int Slot = Entry.getFrameIdx(); | ||
| MachinePointerInfo SlotInfo = | ||
| MachinePointerInfo::getFixedStack(*Func, Slot); | ||
| MachineMemOperand *LoadMMO = Func->getMachineMemOperand( | ||
| SlotInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(Slot), | ||
| FrameInfo.getObjectAlign(Slot)); | ||
|
|
||
| // Build the scratch block load itself. | ||
| MachineInstrBuilder Restore = | ||
| BuildMI(MBB, MI, MI->getDebugLoc(), | ||
| InstrInfo->get(AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE), BlockReg) | ||
| .addFrameIndex(Slot) | ||
| .addReg(SIInfo->getStackPtrOffsetReg()) | ||
| .addImm(0) | ||
| .addImm(BlockMask) | ||
| .addMemOperand(LoadMMO); | ||
| SIRegInfo->addImplicitUsesForBlockCSRLoad(Restore, BlockReg); | ||
|
|
||
| // The block has to be a live-in: when any VGPR inside it is reserved | ||
| // (e.g. a WWM register) the whole block is marked as reserved and | ||
| // `updateLiveness` skips it. | ||
| if (!MBB.isLiveIn(BlockReg)) | ||
| MBB.addLiveIn(BlockReg); | ||
rovka marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| return false; | ||
| } | ||
|
|
||
| MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( | ||
| MachineFunction &MF, | ||
| MachineBasicBlock &MBB, | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.