[AMDGPU] Generate waterfall for calls with SGPR(inreg) argument #146997
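The patch wraps calls whose SGPR (inreg) arguments are fed by divergent VGPR values in a waterfall loop, so the call still receives a uniform value in each iteration. As a rough standalone model of the waterfall pattern (the wave size, the values, and uniformCall are illustrative assumptions, not code from this PR): each iteration picks the first active lane's value, runs the call for every lane that shares it, and then retires those lanes.

```cpp
// Standalone model of a waterfall loop (illustration only).
#include <array>
#include <bitset>
#include <cstdio>

constexpr int WaveSize = 32;

// Stand-in for the call, which the hardware can only make with a uniform
// (SGPR) argument.
static void uniformCall(int ScalarArg, const std::bitset<WaveSize> &Lanes) {
  std::printf("call(%d) executed for %zu lane(s)\n", ScalarArg, Lanes.count());
}

int main() {
  // Per-lane (VGPR-like) values of what the callee expects as an SGPR inreg
  // argument.
  std::array<int, WaveSize> VgprArg;
  for (int Lane = 0; Lane < WaveSize; ++Lane)
    VgprArg[Lane] = Lane / 8; // four distinct values -> four loop iterations

  std::bitset<WaveSize> Exec;
  Exec.set(); // all lanes start active

  // Waterfall loop: v_readfirstlane picks one value, v_cmp selects the lanes
  // that match, the call runs with exec restricted to them, and those lanes
  // are retired until every lane has been serviced.
  while (Exec.any()) {
    int FirstActive = 0;
    while (!Exec.test(FirstActive))
      ++FirstActive;
    int Scalar = VgprArg[FirstActive]; // v_readfirstlane_b32

    std::bitset<WaveSize> Match;
    for (int Lane = 0; Lane < WaveSize; ++Lane)
      Match[Lane] = Exec.test(Lane) && VgprArg[Lane] == Scalar; // v_cmp_eq

    uniformCall(Scalar, Match); // body executes with exec = Match
    Exec &= ~Match;             // remove the serviced lanes
  }
  return 0;
}
```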
Changes from all commits
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -924,12 +924,18 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
      if (!MRI->constrainRegClass(SrcReg, ConstrainRC))
        llvm_unreachable("failed to constrain register");
    } else if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(),
                                      MI, MI.getDebugLoc())) {
      return true;
    }

    if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(), MI,
                               MI.getDebugLoc())) {
      I = std::next(I);
      MI.eraseFromParent();
      return true;
    }
    return true;

    if (!SrcReg.isVirtual())
      return true;
Comment on lines +937 to +938

Contributor (arsenm): I don't understand this early exit, the !SrcReg.isVirtual() has existing explicit handling just below here

Contributor, Author: Hi @arsenm , the upcoming
  }
  if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
    SIInstrWorklist worklist;
@@ -955,7 +961,7 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
  if (PHISources.contains(MI))
    return;
  Register DstReg = MI->getOperand(0).getReg();
  const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
  const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, DstReg);

  V2SCopyInfo Info(getNextVGPRToSGPRCopyId(), MI,
                   TRI->getRegSizeInBits(*DstRC));
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6922,13 +6922,10 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
// Emit the actual waterfall loop, executing the wrapped instruction for each
// unique value of \p ScalarOps across all lanes. In the best case we execute 1
// iteration, in the worst case we execute 64 (once per lane).
static void
emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
                              MachineRegisterInfo &MRI,
                              MachineBasicBlock &LoopBB,
                              MachineBasicBlock &BodyBB,
                              const DebugLoc &DL,
                              ArrayRef<MachineOperand *> ScalarOps) {
static void emitLoadScalarOpsFromVGPRLoop(
    const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB,
    MachineBasicBlock &BodyBB, const DebugLoc &DL,
    ArrayRef<MachineOperand *> ScalarOps, ArrayRef<Register> PhySGPRs = {}) {
  MachineFunction &MF = *LoopBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -6937,8 +6934,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
  MachineBasicBlock::iterator I = LoopBB.begin();
  Register CondReg;

  for (MachineOperand *ScalarOp : ScalarOps) {
  for (auto [Idx, ScalarOp] : enumerate(ScalarOps)) {
    unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
    unsigned NumSubRegs = RegSize / 32;
    Register VScalarOp = ScalarOp->getReg();
@@ -6967,7 +6963,15 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
      }

      // Update ScalarOp operand to use the SGPR ScalarOp.
      ScalarOp->setReg(CurReg);
      if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
        ScalarOp->setReg(CurReg);
      else {
        // Insert into the same block of use
        BuildMI(*ScalarOp->getParent()->getParent(), ScalarOp->getParent(), DL,
                TII.get(AMDGPU::COPY), PhySGPRs[Idx])
            .addReg(CurReg);
        ScalarOp->setReg(PhySGPRs[Idx]);
      }
      ScalarOp->setIsKill();
    } else {
      SmallVector<Register, 8> ReadlanePieces;
@@ -7036,7 +7040,14 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
      }

      // Update ScalarOp operand to use the SGPR ScalarOp.
      ScalarOp->setReg(SScalarOp);
      if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
        ScalarOp->setReg(SScalarOp);
      else {
        BuildMI(*ScalarOp->getParent()->getParent(), ScalarOp->getParent(), DL,
                TII.get(AMDGPU::COPY), PhySGPRs[Idx])
            .addReg(SScalarOp);
        ScalarOp->setReg(PhySGPRs[Idx]);
      }
      ScalarOp->setIsKill();
    }
  }
@@ -7063,11 +7074,14 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
// with SGPRs by iterating over all unique values across all lanes.
// Returns the loop basic block that now contains \p MI.
static MachineBasicBlock *
loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
                               ArrayRef<MachineOperand *> ScalarOps,
                               MachineDominatorTree *MDT,
                               MachineBasicBlock::iterator Begin = nullptr,
                               MachineBasicBlock::iterator End = nullptr) {
generateWaterFallLoop(const SIInstrInfo &TII, MachineInstr &MI,
                      ArrayRef<MachineOperand *> ScalarOps,
                      MachineDominatorTree *MDT,
                      MachineBasicBlock::iterator Begin = nullptr,
                      MachineBasicBlock::iterator End = nullptr,
                      ArrayRef<Register> PhySGPRs = {}) {
  assert((PhySGPRs.empty() || PhySGPRs.size() == ScalarOps.size()) &&
         "Physical SGPRs must be empty or match the number of scalar operands");
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction &MF = *MBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -7152,7 +7166,8 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
    }
  }

  emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
  emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps,
                                PhySGPRs);

  MachineBasicBlock::iterator First = RemainderBB->begin();
  // Restore SCC
@@ -7374,13 +7389,13 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
                                 : AMDGPU::OpName::srsrc;
    MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
      CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
      CreatedBB = generateWaterFallLoop(*this, MI, {SRsrc}, MDT);

    AMDGPU::OpName SampOpName =
        isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
    MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
      CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
      CreatedBB = generateWaterFallLoop(*this, MI, {SSamp}, MDT);

    return CreatedBB;
  }
@@ -7389,27 +7404,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
  if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
    MachineOperand *Dest = &MI.getOperand(0);
    if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
      // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and
      // following copies, we also need to move copies from and to physical
      // registers into the loop block.
      unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
      unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();

      // Also move the copies to physical registers into the loop block
      MachineBasicBlock &MBB = *MI.getParent();
      MachineBasicBlock::iterator Start(&MI);
      while (Start->getOpcode() != FrameSetupOpcode)
        --Start;
      MachineBasicBlock::iterator End(&MI);
      while (End->getOpcode() != FrameDestroyOpcode)
        ++End;
      // Also include following copies of the return value
      ++End;
      while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
             MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
        ++End;
      CreatedBB =
          loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
      createWaterFallForSiCall(&MI, MDT, {Dest});
    }
  }
@@ -7591,19 +7586,18 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
      // Legalize a VGPR Rsrc and soffset together.
      if (!isSoffsetLegal) {
        MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
        CreatedBB =
            loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
        CreatedBB = generateWaterFallLoop(*this, MI, {Rsrc, Soffset}, MDT);
        return CreatedBB;
      }
      CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
      CreatedBB = generateWaterFallLoop(*this, MI, {Rsrc}, MDT);
      return CreatedBB;
    }
  }

  // Legalize a VGPR soffset.
  if (!isSoffsetLegal) {
    MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
    CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
    CreatedBB = generateWaterFallLoop(*this, MI, {Soffset}, MDT);
    return CreatedBB;
  }
  return CreatedBB;
@@ -7672,6 +7666,33 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
  legalizeOperandsVALUt16(MI, OpIdx, MRI);
}

void SIInstrInfo::createWaterFallForSiCall(MachineInstr *MI,
                                           MachineDominatorTree *MDT,
                                           ArrayRef<MachineOperand *> ScalarOps,
                                           ArrayRef<Register> PhySGPRs) const {
  assert(MI->getOpcode() == AMDGPU::SI_CALL_ISEL &&
         "This only handle waterfall for SI_CALL_ISEL");
  // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and
  // following copies, we also need to move copies from and to physical
  // registers into the loop block.
  // Also move the copies to physical registers into the loop block
  MachineBasicBlock &MBB = *MI->getParent();
  MachineBasicBlock::iterator Start(MI);
  while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
    --Start;
  MachineBasicBlock::iterator End(MI);
  while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
    ++End;

  // Also include following copies of the return value
  ++End;
  while (End != MBB.end() && End->isCopy() &&
         MI->definesRegister(End->getOperand(1).getReg(), &RI))
    ++End;

  generateWaterFallLoop(*this, *MI, ScalarOps, MDT, Start, End, PhySGPRs);
}
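As a rough standalone sketch of the window selection in createWaterFallForSiCall (toy opcode strings rather than MIR; the block contents are illustrative assumptions, not from this patch): the scan walks back to ADJCALLSTACKUP, forward past ADJCALLSTACKDOWN, then keeps extending over copies of the call's return value so the whole call sequence lands inside the waterfall loop.

```cpp
// Toy model of the Start/End window computation; not LLVM data structures.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> Block = {
      "ADJCALLSTACKUP",
      "$vgpr0 = COPY %arg",  // outgoing argument setup
      "SI_CALL_ISEL %fptr",  // the call with a divergent operand
      "ADJCALLSTACKDOWN",
      "%ret = COPY $vgpr0",  // copy of the returned physical register
      "S_NOP",               // first instruction left outside the loop
  };
  const std::size_t CallIdx = 2;

  std::size_t Start = CallIdx;
  while (Block[Start] != "ADJCALLSTACKUP")
    --Start;

  std::size_t End = CallIdx;
  while (Block[End] != "ADJCALLSTACKDOWN")
    ++End;
  ++End; // step past ADJCALLSTACKDOWN

  // Keep trailing copies of values the call defines (modelled here simply as
  // copies reading a physical register) inside the window as well.
  while (End < Block.size() && Block[End].find("= COPY $") != std::string::npos)
    ++End;

  std::cout << "range moved into the loop: [" << Start << ", " << End << ")\n";
  return 0;
}
```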

void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
                             MachineDominatorTree *MDT) const {
@@ -7691,6 +7712,90 @@ void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
    assert(Worklist.empty() &&
           "Deferred MachineInstr are not supposed to re-populate worklist");
  }

  for (std::pair<MachineInstr *, V2PhysSCopyInfo> &Entry : Worklist.WaterFalls)
    if (Entry.first->getOpcode() == AMDGPU::SI_CALL_ISEL)
      createWaterFallForSiCall(Entry.first, MDT, Entry.second.MOs,
                               Entry.second.SGPRs);

  for (std::pair<MachineInstr *, bool> Entry : Worklist.V2SPhyCopiesToErase)
    if (Entry.second)
      Entry.first->eraseFromParent();
}

void SIInstrInfo::createReadFirstLaneFromCopyToPhysReg(
    MachineRegisterInfo &MRI, Register DstReg, MachineInstr &Inst) const {
  // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
  // hope for the best.
  const TargetRegisterClass *DstRC = RI.getRegClassForReg(MRI, DstReg);
  ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(DstRC, 4);
  if (BaseIndices.empty() || BaseIndices.size() == 1) {
    Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
            get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
        .add(Inst.getOperand(1));
    BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
            DstReg)
        .addReg(NewDst);
  } else {
    SmallVector<Register, 8> DstRegs;
    for (unsigned i = 0; i < BaseIndices.size(); ++i) {
      Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
              get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
          .addReg(Inst.getOperand(1).getReg(), 0, BaseIndices[i]);

      DstRegs.push_back(NewDst);
    }
    MachineInstrBuilder MIB =
        BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
                get(AMDGPU::REG_SEQUENCE), DstReg);
    for (unsigned i = 0; i < BaseIndices.size(); ++i) {
      MIB.addReg(DstRegs[i]);
      MIB.addImm(RI.getSubRegFromChannel(i));
    }
  }
}
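The helper above splits anything wider than 32 bits into 4-byte pieces, runs V_READFIRSTLANE_B32 on each piece, and stitches the results back together with REG_SEQUENCE. A minimal host-side model of that idea (the two-piece 64-bit width and the concrete values are assumptions for illustration, not the patch's code):

```cpp
// Host-side model of per-32-bit readfirstlane plus reassembly.
#include <array>
#include <cstdint>
#include <cstdio>

constexpr int WaveSize = 32;
using VGPR32 = std::array<uint32_t, WaveSize>; // one 32-bit value per lane

// v_readfirstlane_b32: take the first active lane's value (all lanes assumed
// active here).
static uint32_t readFirstLane(const VGPR32 &V) { return V[0]; }

int main() {
  VGPR32 Lo, Hi;
  Lo.fill(0xDEADBEEFu); // low 32 bits of a 64-bit value, uniform across lanes
  Hi.fill(0x00000042u); // high 32 bits

  // One V_READFIRSTLANE_B32 per 32-bit piece...
  uint32_t SLo = readFirstLane(Lo);
  uint32_t SHi = readFirstLane(Hi);

  // ...then REG_SEQUENCE glues the scalar pieces into the wide SGPR pair.
  uint64_t SgprPair = (static_cast<uint64_t>(SHi) << 32) | SLo;
  std::printf("uniform 64-bit value: 0x%016llx\n",
              static_cast<unsigned long long>(SgprPair));
  return 0;
}
```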
void SIInstrInfo::handleCopyToPhysHelper(SIInstrWorklist &Worklist,
                                         Register DstReg, MachineInstr &Inst,
                                         MachineRegisterInfo &MRI) const {
  if (DstReg == AMDGPU::M0) {
    createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
    Worklist.V2SPhyCopiesToErase.try_emplace(&Inst, true);
    return;
  }
  Register SrcReg = Inst.getOperand(1).getReg();
  MachineBasicBlock::iterator I = Inst.getIterator();
  MachineBasicBlock::iterator E = Inst.getParent()->end();
  // Only search current block since phyreg's def & use cannot cross
  // blocks when MF.NoPhi = false.
  while (++I != E) {
    // Currently, we only support waterfall on SI_CALL_ISEL.
    if (I->getOpcode() == AMDGPU::SI_CALL_ISEL) {
      MachineInstr *UseMI = &*I;
      for (unsigned i = 0; i < UseMI->getNumOperands(); ++i) {
        if (UseMI->getOperand(i).isReg() &&
            UseMI->getOperand(i).getReg() == DstReg) {
          MachineOperand *MO = &UseMI->getOperand(i);
          MO->setReg(SrcReg);
          V2PhysSCopyInfo &V2SCopyInfo = Worklist.WaterFalls[UseMI];
          V2SCopyInfo.MOs.push_back(MO);
          V2SCopyInfo.SGPRs.push_back(DstReg);
          Worklist.V2SPhyCopiesToErase.try_emplace(&Inst, true);
        }
      }
    } else if (I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG &&
               I->getOperand(0).isReg() &&
               I->getOperand(0).getReg() == DstReg) {
      createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
      Worklist.V2SPhyCopiesToErase.try_emplace(&Inst, true);
    } else if (I->readsRegister(DstReg, &RI)) {
      // COPY cannot be erased if other type of inst uses it.
      Worklist.V2SPhyCopiesToErase[&Inst] = false;
    }
    if (I->findRegisterDefOperand(DstReg, &RI))
      break;
  }
}
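A simplified standalone model of the forward scan above (toy instruction records in a single block; the opcodes and register names are assumptions): a call that reads the physical SGPR is recorded for a later waterfall and the copy becomes erasable, any other reader forces the copy to stay, and the scan stops once the register is redefined.

```cpp
// Toy model of handleCopyToPhysHelper's scan; not LLVM data structures.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

struct Inst {
  std::string Opcode;
  std::vector<std::string> Uses; // registers read
  std::vector<std::string> Defs; // registers written
};

static bool contains(const std::vector<std::string> &Regs,
                     const std::string &R) {
  return std::find(Regs.begin(), Regs.end(), R) != Regs.end();
}

int main() {
  const std::string PhysSGPR = "$sgpr4"; // destination of the VGPR->SGPR copy
  const std::vector<Inst> Block = {
      {"COPY", {"%vgpr_arg"}, {PhysSGPR}},         // the copy being examined
      {"SI_CALL_ISEL", {"%callee", PhysSGPR}, {}}, // call consuming the SGPR
  };

  bool EraseCopy = true;
  for (std::size_t i = 1; i < Block.size(); ++i) {
    const Inst &I = Block[i];
    if (I.Opcode == "SI_CALL_ISEL" && contains(I.Uses, PhysSGPR)) {
      // Rewrite the operand back to the VGPR source and queue the call for a
      // waterfall loop that will feed the SGPR per unique value.
      std::cout << "record " << I.Opcode << " for waterfall\n";
    } else if (contains(I.Uses, PhysSGPR)) {
      EraseCopy = false; // some other reader still needs the physical copy
    }
    if (contains(I.Defs, PhysSGPR))
      break; // register redefined; stop scanning
  }
  std::cout << (EraseCopy ? "copy can be erased\n" : "copy must stay\n");
  return 0;
}
```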

void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
@@ -8194,19 +8299,9 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
  Register DstReg = Inst.getOperand(0).getReg();
  const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);

  // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
  // hope for the best.
  if (Inst.isCopy() && DstReg.isPhysical() &&
      RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
    Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
            get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
        .add(Inst.getOperand(1));
    BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
            DstReg)
        .addReg(NewDst);

    Inst.eraseFromParent();
      Inst.getOperand(1).getReg().isVirtual()) {
    handleCopyToPhysHelper(Worklist, DstReg, Inst, MRI);
    return;
  }