Commit bf50bed

[AMDGPU] Change control flow intrinsic lowering so that the wave reconverges at the end of the predecessor block.
1 parent 82c5d35 commit bf50bed

File tree

196 files changed: +20209, -13418 lines

Some content is hidden: large commits have some content hidden by default, so only a subset of the 196 changed files is shown below.

llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp

Lines changed: 15 additions & 31 deletions
@@ -306,42 +306,26 @@ bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
 
 /// Close the last opened control flow
 bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
-  llvm::Loop *L = LI->getLoopFor(BB);
-
   assert(Stack.back().first == BB);
 
-  if (L && L->getHeader() == BB) {
-    // We can't insert an EndCF call into a loop header, because it will
-    // get executed on every iteration of the loop, when it should be
-    // executed only once before the loop.
-    SmallVector <BasicBlock *, 8> Latches;
-    L->getLoopLatches(Latches);
-
-    SmallVector<BasicBlock *, 2> Preds;
-    for (BasicBlock *Pred : predecessors(BB)) {
-      if (!is_contained(Latches, Pred))
-        Preds.push_back(Pred);
-    }
-
-    BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, nullptr,
-                                false);
-  }
-
   Value *Exec = popSaved();
-  BasicBlock::iterator FirstInsertionPt = BB->getFirstInsertionPt();
-  if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt)) {
-    Instruction *ExecDef = cast<Instruction>(Exec);
-    BasicBlock *DefBB = ExecDef->getParent();
-    if (!DT->dominates(DefBB, BB)) {
-      // Split edge to make Def dominate Use
-      FirstInsertionPt = SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt();
+  Instruction *ExecDef = dyn_cast<Instruction>(Exec);
+  BasicBlock *DefBB = ExecDef->getParent();
+  for (auto Pred : predecessors(BB)) {
+    llvm::Loop *L = LI->getLoopFor(Pred);
+    bool IsLoopLatch = false;
+    if (L) {
+      SmallVector<BasicBlock *, 4> LL;
+      L->getLoopLatches(LL);
+      IsLoopLatch = std::find_if(LL.begin(), LL.end(), [Pred](BasicBlock *B) {
+                      return B == Pred;
+                    }) != LL.end();
+    }
+    if (Pred != DefBB && DT->dominates(DefBB, Pred) && !IsLoopLatch) {
+      BasicBlock::iterator InsPt(Pred->getTerminator());
+      IRBuilder<>(Pred, InsPt).CreateCall(EndCf, {Exec});
     }
-    IRBuilder<> IRB(FirstInsertionPt->getParent(), FirstInsertionPt);
-    // TODO: StructurizeCFG 'Flow' blocks have debug locations from the
-    // condition, for now just avoid copying these DebugLocs so that stepping
-    // out of the then/else block in a debugger doesn't step to the condition.
-    IRB.SetCurrentDebugLocation(DebugLoc());
-    IRB.CreateCall(EndCf, {Exec});
   }
 
   return true;
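For readers skimming the diff: the predecessor test added above can be read in isolation as the small helper below. This is an illustrative sketch, not code from the commit; the function name and signature are invented, and llvm::is_contained stands in for the std::find_if call in the patch.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"

using namespace llvm;

// Sketch only: mirrors the test added to closeControlFlow() above. EndCf is
// emitted at the end of Pred (instead of at the top of the merge block BB)
// when Pred is not the block defining the saved exec mask, that definition
// dominates Pred, and Pred is not a loop latch.
static bool shouldEmitEndCFInPred(BasicBlock *Pred, BasicBlock *DefBB,
                                  const DominatorTree &DT,
                                  const LoopInfo &LI) {
  bool IsLoopLatch = false;
  if (Loop *L = LI.getLoopFor(Pred)) {
    SmallVector<BasicBlock *, 4> Latches;
    L->getLoopLatches(Latches);
    IsLoopLatch = is_contained(Latches, Pred);
  }
  return Pred != DefBB && DT.dominates(DefBB, Pred) && !IsLoopLatch;
}

The latch exclusion preserves the intent of the removed loop-header special case: placing the end-cf call on a latch edge would re-execute it on every iteration instead of once after the loop.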

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 85 additions & 0 deletions
@@ -15740,6 +15740,91 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
     }
   }
 
+  // ISel inserts copies to registers for the successor PHIs
+  // at the BB end. We need to move the SI_END_CF right before the branch.
+  // Even if we don't have to move SI_END_CF we need to take care of the
+  // S_CBRANCH_SCC0/1 as SI_END_CF overwrites SCC.
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      if (MI.getOpcode() == AMDGPU::SI_END_CF) {
+        MachineBasicBlock::iterator I(MI);
+        MachineBasicBlock::iterator Next = std::next(I);
+        bool NeedToMove = false;
+        while (Next != MBB.end() && !Next->isBranch()) {
+          NeedToMove = true;
+          Next++;
+        }
+
+        // Let's take care of SCC users as SI_END_CF defines SCC
+        bool NeedPreserveSCC =
+            Next != MBB.end() && Next->readsRegister(AMDGPU::SCC);
+        MachineBasicBlock::iterator SCCDefUse(Next);
+        // This loop will never be taken as we always have S_CBRANCH_SCC1/0 at
+        // the end of the block.
+        while (!NeedPreserveSCC && SCCDefUse != MBB.end()) {
+          if (SCCDefUse->definesRegister(AMDGPU::SCC))
+            // This should never happen - SCC def after the branch reading SCC
+            break;
+          if (SCCDefUse->readsRegister(AMDGPU::SCC)) {
+            NeedPreserveSCC = true;
+            break;
+          }
+          SCCDefUse++;
+        }
+        if (NeedPreserveSCC) {
+          MachineBasicBlock::reverse_iterator BackSeeker(Next);
+          while (BackSeeker != MBB.rend()) {
+            if (BackSeeker != MI && BackSeeker->definesRegister(AMDGPU::SCC))
+              break;
+            BackSeeker++;
+          }
+          // We need this to make some artificial MIR tests happy.
+          bool NeedSetSCCUndef = false;
+          if (BackSeeker == MBB.rend()) {
+            // We have reached the beginning of the block but haven't seen the
+            // SCC def. Given that the MIR is correct, we either have SCC live
+            // in or the SCC user's SCC operand is undef. In fact, we don't
+            // need to emit the instructions that preserve the SCC if the use
+            // is undef. We do this just because the MIR looks weird otherwise.
+            MachineOperand *SCCUseOp =
+                SCCDefUse->findRegisterUseOperand(AMDGPU::SCC, false, TRI);
+            assert(SCCUseOp);
+            bool IsSCCLiveIn = MBB.isLiveIn(AMDGPU::SCC);
+            bool IsUseUndef = SCCUseOp->isUndef();
+            NeedSetSCCUndef = (!IsSCCLiveIn && IsUseUndef);
+          }
+          MachineBasicBlock::iterator InsPt(BackSeeker);
+          Register SavedSCC =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+          MachineInstr *SaveSCC =
+              BuildMI(MBB, InsPt, InsPt->getDebugLoc(),
+                      TII->get(AMDGPU::S_CSELECT_B32), SavedSCC)
+                  .addImm(1)
+                  .addImm(0);
+          if (NeedSetSCCUndef) {
+
+            MachineOperand *SCCOp =
+                SaveSCC->findRegisterUseOperand(AMDGPU::SCC, false, TRI);
+            if (SCCOp)
+              SCCOp->setIsUndef();
+          }
+          Register Tmp =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+          Next = BuildMI(MBB, Next, Next->getDebugLoc(),
+                         TII->get(AMDGPU::S_AND_B32_term), Tmp)
+                     .addReg(SavedSCC)
+                     .addImm(1);
+        }
+
+        if (NeedToMove) {
+          MBB.splice(Next, &MBB, &MI);
+        }
+
+        break;
+      }
+    }
+  }
+
   // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
   // classes if required. Ideally the register class constraints would differ
   // per-subtarget, but there's no easy way to achieve that right now. This is
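The SCC handling above is the subtle part: as the comment notes, SI_END_CF overwrites SCC, so an S_CBRANCH_SCC0/1 that still needs SCC would otherwise read a clobbered value. Condensed, the fix is the save/restore pair sketched below. This is not additional code from the commit: the helper name and parameters are invented, and it assumes the surrounding context of SIISelLowering.cpp (the AMDGPU opcode and register-class definitions, MachineRegisterInfo, SIInstrInfo).

// Sketch only: the save/restore shape built by the loop above.
// SavePt is a point where SCC still holds the value the branch needs, before
// SI_END_CF clobbers it; RestorePt is the SCC reader (the branch) after it.
static void saveAndRestoreSCC(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator SavePt,
                              MachineBasicBlock::iterator RestorePt,
                              MachineRegisterInfo &MRI,
                              const SIInstrInfo *TII) {
  // SavedSCC = SCC ? 1 : 0 (S_CSELECT_B32 reads SCC implicitly).
  Register SavedSCC = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(MBB, SavePt, SavePt->getDebugLoc(),
          TII->get(AMDGPU::S_CSELECT_B32), SavedSCC)
      .addImm(1)
      .addImm(0);

  // SCC = (SavedSCC & 1) != 0, recomputed as a terminator immediately before
  // the S_CBRANCH_SCC0/1 that reads it.
  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(MBB, RestorePt, RestorePt->getDebugLoc(),
          TII->get(AMDGPU::S_AND_B32_term), Tmp)
      .addReg(SavedSCC)
      .addImm(1);
}

The splice at the end of the loop then moves SI_END_CF itself down next to the block's terminators, so the exec-mask restore happens right before the branch rather than above the PHI copies that ISel placed at the block end.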

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 1 addition & 0 deletions
@@ -3102,6 +3102,7 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
     break;
   case AMDGPU::SI_IF:
   case AMDGPU::SI_ELSE:
+  case AMDGPU::SI_END_CF:
   case AMDGPU::SI_KILL_I1_TERMINATOR:
   case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
     // FIXME: It's messy that these need to be considered here at all.
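This one-line addition goes hand in hand with the SIInstructions.td change below: once SI_END_CF is marked as a terminator, SIInstrInfo::analyzeBranch meets it while walking a block's terminators and, grouped here with SI_IF and SI_ELSE, treats such a block ending as one it cannot analyze. Below is a minimal caller-side sketch of that contract; the names are invented and nothing in it comes from the commit.

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

using namespace llvm;

// Sketch only: analyzeBranch returning true means "could not analyze", so
// generic passes must leave the block's terminators (which may now include
// SI_END_CF) untouched; returning false means TBB/FBB/Cond describe the
// ending and it may be rewritten via removeBranch/insertBranch.
static bool canRewriteBlockEnding(MachineBasicBlock &MBB,
                                  const TargetInstrInfo &TII) {
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  return !TII.analyzeBranch(MBB, TBB, FBB, Cond, /*AllowModify=*/false);
}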

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 2 additions & 2 deletions
@@ -475,8 +475,6 @@ def SI_LOOP : CFPseudoInstSI <
   let IsNeverUniform = 1;
 }
 
-} // End isTerminator = 1
-
 def SI_END_CF : CFPseudoInstSI <
   (outs), (ins SReg_1:$saved), [], 1, 1> {
   let Size = 4;
@@ -488,6 +486,8 @@ def SI_END_CF : CFPseudoInstSI <
   let mayStore = 1;
 }
 
+} // End isTerminator = 1
+
 def SI_IF_BREAK : CFPseudoInstSI <
   (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> {
   let Size = 4;
