Skip to content

Commit 6b2a6ab

Browse files
committed
[AMDGPU] Change control flow intrinsic lowering so that the wave reconverges at the end of the predecessor block. The si_end_cf intrinsic and opcode are renamed to si_wave_reconverge. The fix that restored the exec mask on Else is reverted and replaced with another approach: placing si_wave_reconverge in any predecessor of the Stack top in SIAnnotateControlFlow.
1 parent 78e9b5d commit 6b2a6ab

File tree

92 files changed

+827
-844
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

92 files changed

+827
-844
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3172,8 +3172,8 @@ def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
31723172
[llvm_anyint_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree]
31733173
>;
31743174

3175-
def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty],
3176-
[IntrWillReturn, IntrNoCallback, IntrNoFree]>;
3175+
def int_amdgcn_wave_reconverge : Intrinsic<[], [llvm_anyint_ty],
3176+
[IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
31773177

31783178
// Represent unreachable in a divergent region.
31793179
def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent, IntrNoCallback, IntrNoFree]>;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1557,7 +1557,7 @@ bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
15571557
// FIXME: Manually selecting to avoid dealing with the SReg_1 trick
15581558
// SelectionDAG uses for wave32 vs wave64.
15591559
MachineBasicBlock *BB = MI.getParent();
1560-
BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1560+
BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_WAVE_RECONVERGE))
15611561
.add(MI.getOperand(1));
15621562

15631563
Register Reg = MI.getOperand(1).getReg();
@@ -2083,7 +2083,7 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
20832083
MachineInstr &I) const {
20842084
unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
20852085
switch (IntrinsicID) {
2086-
case Intrinsic::amdgcn_end_cf:
2086+
case Intrinsic::amdgcn_wave_reconverge:
20872087
return selectEndCfIntrinsic(I);
20882088
case Intrinsic::amdgcn_ds_ordered_add:
20892089
case Intrinsic::amdgcn_ds_ordered_swap:

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4954,7 +4954,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
49544954
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
49554955
break;
49564956
}
4957-
case Intrinsic::amdgcn_end_cf: {
4957+
case Intrinsic::amdgcn_wave_reconverge: {
49584958
unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
49594959
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
49604960
break;

llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp

Lines changed: 22 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ class SIAnnotateControlFlow : public FunctionPass {
5353
Function *Else;
5454
Function *IfBreak;
5555
Function *Loop;
56-
Function *EndCf;
56+
Function *WaveReconverge;
5757

5858
DominatorTree *DT;
5959
StackVector Stack;
@@ -86,7 +86,7 @@ class SIAnnotateControlFlow : public FunctionPass {
8686

8787
bool handleLoop(BranchInst *Term);
8888

89-
bool closeControlFlow(BasicBlock *BB);
89+
bool insertWaveReconverge(BasicBlock *BB);
9090

9191
public:
9292
static char ID;
@@ -141,7 +141,7 @@ void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) {
141141
IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break,
142142
{ IntMask });
143143
Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask });
144-
EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask });
144+
WaveReconverge = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_wave_reconverge, { IntMask });
145145
}
146146

147147
/// Is the branch condition uniform or did the StructurizeCFG pass
@@ -305,28 +305,20 @@ bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
305305
}
306306

307307
/// Close the last opened control flow
308-
bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
309-
310-
assert(Stack.back().first == BB);
311-
312-
Value *Exec = popSaved();
313-
Instruction *ExecDef = dyn_cast<Instruction>(Exec);
314-
BasicBlock *DefBB = ExecDef->getParent();
315-
for (auto Pred : predecessors(BB)) {
316-
llvm::Loop *L = LI->getLoopFor(Pred);
317-
bool IsLoopLatch = false;
318-
if (L) {
319-
SmallVector<BasicBlock *, 4> LL;
320-
L->getLoopLatches(LL);
321-
IsLoopLatch = std::find_if(LL.begin(), LL.end(), [Pred](BasicBlock *B) {
322-
return B == Pred;
323-
}) != LL.end();
324-
}
325-
if (Pred != DefBB && DT->dominates(DefBB, Pred) && !IsLoopLatch) {
326-
BasicBlock::iterator InsPt(Pred->getTerminator());
327-
IRBuilder<>(Pred, InsPt).CreateCall(EndCf, {Exec});
328-
}
329-
}
308+
bool SIAnnotateControlFlow::insertWaveReconverge(BasicBlock *BB) {
309+
assert(succ_empty(BB) || succ_size(BB) == 1);
310+
311+
if (succ_empty(BB))
312+
return false;
313+
314+
BasicBlock *SingleSucc = *succ_begin(BB);
315+
BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
316+
BasicBlock::iterator InsPt = Term ? BasicBlock::iterator(Term) : BB->end();
317+
318+
if (isTopOfStack(SingleSucc)) {
319+
Value *Exec = Stack.back().second;
320+
IRBuilder<>(BB, InsPt).CreateCall(WaveReconverge, {Exec});
321+
}
330322

331323
return true;
332324
}
@@ -349,14 +341,16 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
349341

350342
if (!Term || Term->isUnconditional()) {
351343
if (isTopOfStack(BB))
352-
Changed |= closeControlFlow(BB);
344+
Stack.pop_back();
345+
346+
insertWaveReconverge(BB);
353347

354348
continue;
355349
}
356350

357351
if (I.nodeVisited(Term->getSuccessor(1))) {
358352
if (isTopOfStack(BB))
359-
Changed |= closeControlFlow(BB);
353+
Stack.pop_back();
360354

361355
if (DT->dominates(Term->getSuccessor(1), BB))
362356
Changed |= handleLoop(Term);
@@ -371,7 +365,7 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
371365
continue;
372366
}
373367

374-
Changed |= closeControlFlow(BB);
368+
Stack.pop_back();
375369
}
376370

377371
Changed |= openIf(Term);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6299,7 +6299,7 @@ unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
62996299
return AMDGPUISD::ELSE;
63006300
case Intrinsic::amdgcn_loop:
63016301
return AMDGPUISD::LOOP;
6302-
case Intrinsic::amdgcn_end_cf:
6302+
case Intrinsic::amdgcn_wave_reconverge:
63036303
llvm_unreachable("should not occur");
63046304
default:
63056305
return 0;
@@ -9940,8 +9940,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
99409940

99419941
return SDValue(Load, 0);
99429942
}
9943-
case Intrinsic::amdgcn_end_cf:
9944-
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
9943+
case Intrinsic::amdgcn_wave_reconverge:
9944+
return SDValue(DAG.getMachineNode(AMDGPU::SI_WAVE_RECONVERGE, DL, MVT::Other,
99459945
Op->getOperand(2), Chain), 0);
99469946
case Intrinsic::amdgcn_s_barrier_init:
99479947
case Intrinsic::amdgcn_s_barrier_join:
@@ -15741,12 +15741,12 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
1574115741
}
1574215742

1574315743
// ISel inserts copy to regs for the successor PHIs
15744-
// at the BB end. We need to move the SI_END_CF right before the branch.
15745-
// Even we don't have to move SI_END_CF we need to take care of the
15746-
// S_CBRANCH_SCC0/1 as SI_END_CF overwrites SCC
15744+
// at the BB end. We need to move the SI_WAVE_RECONVERGE right before the branch.
15745+
// Even if we don't have to move SI_WAVE_RECONVERGE, we need to take care of the
15746+
// S_CBRANCH_SCC0/1 as SI_WAVE_RECONVERGE overwrites SCC
1574715747
for (auto &MBB : MF) {
1574815748
for (auto &MI : MBB) {
15749-
if (MI.getOpcode() == AMDGPU::SI_END_CF) {
15749+
if (MI.getOpcode() == AMDGPU::SI_WAVE_RECONVERGE) {
1575015750
MachineBasicBlock::iterator I(MI);
1575115751
MachineBasicBlock::iterator Next = std::next(I);
1575215752
bool NeedToMove = false;
@@ -15755,7 +15755,7 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
1575515755
Next++;
1575615756
}
1575715757

15758-
// Lets take care of SCC users as S_END_CF defines SCC
15758+
// Let's take care of SCC users as SI_WAVE_RECONVERGE defines SCC
1575915759
bool NeedPreserveSCC =
1576015760
Next != MBB.end() && Next->readsRegister(AMDGPU::SCC);
1576115761
MachineBasicBlock::iterator SCCDefUse(Next);
@@ -16421,7 +16421,7 @@ static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
1642116421
default:
1642216422
Result = false;
1642316423
break;
16424-
case Intrinsic::amdgcn_end_cf:
16424+
case Intrinsic::amdgcn_wave_reconverge:
1642516425
case Intrinsic::amdgcn_loop:
1642616426
Result = true;
1642716427
break;

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3102,7 +3102,7 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
31023102
break;
31033103
case AMDGPU::SI_IF:
31043104
case AMDGPU::SI_ELSE:
3105-
case AMDGPU::SI_END_CF:
3105+
case AMDGPU::SI_WAVE_RECONVERGE:
31063106
case AMDGPU::SI_KILL_I1_TERMINATOR:
31073107
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
31083108
// FIXME: It's messy that these need to be considered here at all.
@@ -8783,7 +8783,7 @@ void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
87838783
.add(Branch->getOperand(0))
87848784
.add(Branch->getOperand(1));
87858785
MachineInstr *SIEND =
8786-
BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
8786+
BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_WAVE_RECONVERGE))
87878787
.addReg(DstReg);
87888788

87898789
IfEntry->erase(TI);

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -475,7 +475,7 @@ def SI_LOOP : CFPseudoInstSI <
475475
let IsNeverUniform = 1;
476476
}
477477

478-
def SI_END_CF : CFPseudoInstSI <
478+
def SI_WAVE_RECONVERGE : CFPseudoInstSI <
479479
(outs), (ins SReg_1:$saved), [], 1, 1> {
480480
let Size = 4;
481481
let isAsCheapAsAMove = 1;

llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp

Lines changed: 7 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0
2626
/// %sgpr0 = SI_ELSE %sgpr0
2727
/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0
28-
/// SI_END_CF %sgpr0
28+
/// SI_WAVE_RECONVERGE %sgpr0
2929
///
3030
/// becomes:
3131
///
@@ -103,10 +103,7 @@ class SILowerControlFlow : public MachineFunctionPass {
103103
void emitWaveDiverge(MachineInstr &MI, Register EnabledLanesMask,
104104
Register DisableLanesMask);
105105

106-
void emitWaveInvert(MachineInstr &MI, Register EnabledLanesMask,
107-
Register DisableLanesMask);
108-
109-
void emitEndCf(MachineInstr &MI);
106+
void emitWaveReconverge(MachineInstr &MI);
110107

111108
void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);
112109

@@ -198,7 +195,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
198195
void SILowerControlFlow::emitElse(MachineInstr &MI) {
199196
Register InvCondReg = MI.getOperand(0).getReg();
200197
Register CondReg = MI.getOperand(1).getReg();
201-
emitWaveInvert(MI, CondReg, InvCondReg);
198+
emitWaveDiverge(MI, CondReg, InvCondReg);
202199
}
203200

204201
void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
@@ -375,89 +372,7 @@ void SILowerControlFlow::emitWaveDiverge(MachineInstr &MI,
375372
LIS->removeAllRegUnitsForPhysReg(Exec);
376373
}
377374

378-
void SILowerControlFlow::emitWaveInvert(MachineInstr &MI,
379-
Register EnabledLanesMask,
380-
Register DisableLanesMask) {
381-
MachineBasicBlock &MBB = *MI.getParent();
382-
const DebugLoc &DL = MI.getDebugLoc();
383-
MachineBasicBlock::iterator I(MI);
384-
385-
MachineInstr *CondInverted =
386-
BuildMI(MBB, I, DL, TII->get(XorOpc), DisableLanesMask)
387-
.addReg(EnabledLanesMask)
388-
.addReg(Exec);
389-
390-
if (LV) {
391-
LV->replaceKillInstruction(DisableLanesMask, MI, *CondInverted);
392-
}
393-
394-
Register TestResultReg = MRI->createVirtualRegister(BoolRC);
395-
// If the EnableLanesMask is zero we have to restore the masked bits on the
396-
// skip way
397-
Register ExitMask = MRI->createVirtualRegister(BoolRC);
398-
MachineInstr *ExitMaskSet = BuildMI(MBB, I, DL, TII->get(OrOpc), ExitMask)
399-
.addReg(Exec)
400-
.addReg(DisableLanesMask);
401-
402-
MachineInstr *IfZeroMask =
403-
BuildMI(MBB, I, DL, TII->get(AndOpc), TestResultReg)
404-
.addReg(EnabledLanesMask)
405-
.addImm(TestMask);
406-
407-
MachineInstr *SetExecForSucc = BuildMI(MBB, I, DL, TII->get(Select), Exec)
408-
.addReg(EnabledLanesMask)
409-
.addReg(ExitMask);
410-
411-
MachineBasicBlock *FlowBB = MI.getOperand(2).getMBB();
412-
MachineBasicBlock *TargetBB = nullptr;
413-
// determine target BBs
414-
I = skipToUncondBrOrEnd(MBB, I);
415-
if (I != MBB.end()) {
416-
// skipToUncondBrOrEnd returns either unconditional branch or end()
417-
TargetBB = I->getOperand(0).getMBB();
418-
I->getOperand(0).setMBB(FlowBB);
419-
} else {
420-
// assert(MBB.succ_size() == 2);
421-
for (auto Succ : successors(&MBB)) {
422-
if (Succ != FlowBB) {
423-
TargetBB = Succ;
424-
break;
425-
}
426-
}
427-
I = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(FlowBB);
428-
if (LIS)
429-
LIS->InsertMachineInstrInMaps(*I);
430-
}
431-
432-
if (TargetBB) {
433-
MachineInstr *NewBr =
434-
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)).addMBB(TargetBB);
435-
if (LIS)
436-
LIS->InsertMachineInstrInMaps(*NewBr);
437-
}
438-
439-
if (!LIS) {
440-
MI.eraseFromParent();
441-
return;
442-
}
443-
444-
LIS->InsertMachineInstrInMaps(*CondInverted);
445-
LIS->InsertMachineInstrInMaps(*ExitMaskSet);
446-
LIS->InsertMachineInstrInMaps(*IfZeroMask);
447-
LIS->ReplaceMachineInstrInMaps(MI, *SetExecForSucc);
448-
449-
RecomputeRegs.insert(MI.getOperand(0).getReg());
450-
RecomputeRegs.insert(MI.getOperand(1).getReg());
451-
452-
MI.eraseFromParent();
453-
454-
LIS->createAndComputeVirtRegInterval(TestResultReg);
455-
LIS->createAndComputeVirtRegInterval(ExitMask);
456-
457-
LIS->removeAllRegUnitsForPhysReg(Exec);
458-
}
459-
460-
void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
375+
void SILowerControlFlow::emitWaveReconverge(MachineInstr &MI) {
461376

462377
MachineBasicBlock &BB = *MI.getParent();
463378
Register Mask = MI.getOperand(0).getReg();
@@ -558,8 +473,8 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
558473
MI.setDesc(TII->get(AMDGPU::S_CBRANCH_EXECNZ));
559474
break;
560475

561-
case AMDGPU::SI_END_CF:
562-
emitEndCf(MI);
476+
case AMDGPU::SI_WAVE_RECONVERGE:
477+
emitWaveReconverge(MI);
563478
break;
564479

565480
default:
@@ -762,7 +677,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
762677
case AMDGPU::SI_IF_BREAK:
763678
case AMDGPU::SI_WATERFALL_LOOP:
764679
case AMDGPU::SI_LOOP:
765-
case AMDGPU::SI_END_CF:
680+
case AMDGPU::SI_WAVE_RECONVERGE:
766681
SplitMBB = process(MI);
767682
Changed = true;
768683
break;

llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ body: |
5353
5454
%5:sreg_32 = PHI %14, %bb.0, %3, %bb.1
5555
%6:vreg_1 = PHI %1, %bb.0, %4, %bb.1
56-
SI_END_CF %2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
56+
SI_WAVE_RECONVERGE %2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
5757
%27:sreg_64 = COPY %6
5858
%7:sreg_64 = SI_IF %27, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
5959
S_BRANCH %bb.3
@@ -65,7 +65,7 @@ body: |
6565
6666
bb.4:
6767
%9:vgpr_32 = PHI %5, %bb.2, %8, %bb.3
68-
SI_END_CF %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
68+
SI_WAVE_RECONVERGE %7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
6969
%28:sreg_64 = IMPLICIT_DEF
7070
%29:vreg_64 = COPY %28
7171
GLOBAL_STORE_DWORD killed %29, %9, 0, 0, implicit $exec

0 commit comments

Comments
 (0)