Skip to content

Commit 1f52d02

Browse files
committed
[AMDGPU] Split waterfall loop exec manipulation
Split waterfall loops into multiple blocks so that exec mask manipulation (s_and_saveexec) does not occur in the middle of a block. The VGPR live range optimizer is updated to handle waterfall loops spanning multiple blocks. Reviewed By: ruiling. Differential Revision: https://reviews.llvm.org/D122200
1 parent 12f0802 commit 1f52d02

File tree

42 files changed

+1438
-861
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+1438
-861
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -741,16 +741,19 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
741741
// To insert the loop we need to split the block. Move everything after this
742742
// point to a new block, and insert a new empty block before this instruction.
743743
MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
744+
MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
744745
MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
745746
MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
746747
MachineFunction::iterator MBBI(MBB);
747748
++MBBI;
748749
MF->insert(MBBI, LoopBB);
750+
MF->insert(MBBI, BodyBB);
749751
MF->insert(MBBI, RestoreExecBB);
750752
MF->insert(MBBI, RemainderBB);
751753

752-
LoopBB->addSuccessor(RestoreExecBB);
753-
LoopBB->addSuccessor(LoopBB);
754+
LoopBB->addSuccessor(BodyBB);
755+
BodyBB->addSuccessor(RestoreExecBB);
756+
BodyBB->addSuccessor(LoopBB);
754757

755758
// Move the rest of the block into a new block.
756759
RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
@@ -762,26 +765,26 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
762765
B.setInsertPt(*LoopBB, LoopBB->end());
763766

764767
B.buildInstr(TargetOpcode::PHI)
765-
.addDef(PhiExec)
766-
.addReg(InitSaveExecReg)
767-
.addMBB(&MBB)
768-
.addReg(NewExec)
769-
.addMBB(LoopBB);
768+
.addDef(PhiExec)
769+
.addReg(InitSaveExecReg)
770+
.addMBB(&MBB)
771+
.addReg(NewExec)
772+
.addMBB(BodyBB);
770773

771774
const DebugLoc &DL = B.getDL();
772775

773776
MachineInstr &FirstInst = *Range.begin();
774777

775-
// Move the instruction into the loop. Note we moved everything after
778+
// Move the instruction into the loop body. Note we moved everything after
776779
// Range.end() already into a new block, so Range.end() is no longer valid.
777-
LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
780+
BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
778781

779782
// Figure out the iterator range after splicing the instructions.
780783
MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
781-
auto NewEnd = LoopBB->end();
784+
auto NewEnd = BodyBB->end();
782785

783-
MachineBasicBlock::iterator I = Range.begin();
784-
B.setInsertPt(*LoopBB, I);
786+
MachineBasicBlock::iterator I = LoopBB->end();
787+
B.setMBB(*LoopBB);
785788

786789
Register CondReg;
787790

@@ -813,7 +816,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
813816
B.setMBB(MBB);
814817
OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
815818
MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
816-
B.setInstr(*I);
819+
B.setMBB(*LoopBB);
817820
}
818821

819822
unsigned OpSize = OpTy.getSizeInBits();
@@ -879,7 +882,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
879882
for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx)
880883
UnmergePieces.push_back(Unmerge.getReg(PieceIdx));
881884
}
882-
B.setInstr(*I);
885+
B.setMBB(*LoopBB);
883886

884887
for (Register UnmergePiece : UnmergePieces) {
885888
Register CurrentLaneOpReg;
@@ -978,7 +981,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
978981

979982
MRI.setSimpleHint(NewExec, CondReg);
980983

981-
B.setInsertPt(*LoopBB, LoopBB->end());
984+
B.setInsertPt(*BodyBB, BodyBB->end());
982985

983986
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
984987
B.buildInstr(XorTermOpc)

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5464,7 +5464,8 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
54645464
static void
54655465
emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
54665466
MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5467-
const DebugLoc &DL, MachineOperand &Rsrc) {
5467+
MachineBasicBlock &BodyBB, const DebugLoc &DL,
5468+
MachineOperand &Rsrc) {
54685469
MachineFunction &MF = *OrigBB.getParent();
54695470
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
54705471
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -5557,14 +5558,14 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
55575558
.addReg(CondReg, RegState::Kill);
55585559

55595560
// The original instruction is here; we insert the terminators after it.
5560-
I = LoopBB.end();
5561+
I = BodyBB.end();
55615562

55625563
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
5563-
BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec)
5564+
BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
55645565
.addReg(Exec)
55655566
.addReg(SaveExec);
55665567

5567-
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
5568+
BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
55685569
}
55695570

55705571
// Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
@@ -5611,44 +5612,48 @@ loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
56115612
// To insert the loop we need to split the block. Move everything after this
56125613
// point to a new block, and insert the new loop blocks between the two.
56135614
MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
5615+
MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
56145616
MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
56155617
MachineFunction::iterator MBBI(MBB);
56165618
++MBBI;
56175619

56185620
MF.insert(MBBI, LoopBB);
5621+
MF.insert(MBBI, BodyBB);
56195622
MF.insert(MBBI, RemainderBB);
56205623

5621-
LoopBB->addSuccessor(LoopBB);
5622-
LoopBB->addSuccessor(RemainderBB);
5624+
LoopBB->addSuccessor(BodyBB);
5625+
BodyBB->addSuccessor(LoopBB);
5626+
BodyBB->addSuccessor(RemainderBB);
56235627

5624-
// Move Begin to MI to the LoopBB, and the remainder of the block to
5628+
// Move Begin to MI to the BodyBB, and the remainder of the block to
56255629
// RemainderBB.
56265630
RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
56275631
RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
5628-
LoopBB->splice(LoopBB->begin(), &MBB, Begin, MBB.end());
5632+
BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
56295633

56305634
MBB.addSuccessor(LoopBB);
56315635

56325636
// Update dominators. We know that MBB immediately dominates LoopBB, that
5633-
// LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
5634-
// dominates all of the successors transferred to it from MBB that MBB used
5635-
// to properly dominate.
5637+
// LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
5638+
// RemainderBB. RemainderBB immediately dominates all of the successors
5639+
// transferred to it from MBB that MBB used to properly dominate.
56365640
if (MDT) {
56375641
MDT->addNewBlock(LoopBB, &MBB);
5638-
MDT->addNewBlock(RemainderBB, LoopBB);
5642+
MDT->addNewBlock(BodyBB, LoopBB);
5643+
MDT->addNewBlock(RemainderBB, BodyBB);
56395644
for (auto &Succ : RemainderBB->successors()) {
56405645
if (MDT->properlyDominates(&MBB, Succ)) {
56415646
MDT->changeImmediateDominator(Succ, RemainderBB);
56425647
}
56435648
}
56445649
}
56455650

5646-
emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
5651+
emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, Rsrc);
56475652

56485653
// Restore the EXEC mask
56495654
MachineBasicBlock::iterator First = RemainderBB->begin();
56505655
BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
5651-
return LoopBB;
5656+
return BodyBB;
56525657
}
56535658

56545659
// Extract pointer from Rsrc and return a zero-value Rsrc replacement.

llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp

Lines changed: 83 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,10 @@ class SIOptimizeVGPRLiveRange : public MachineFunctionPass {
112112
SmallVectorImpl<Register> &CandidateRegs) const;
113113

114114
void collectWaterfallCandidateRegisters(
115-
MachineBasicBlock *Loop,
116-
SmallSetVector<Register, 16> &CandidateRegs) const;
115+
MachineBasicBlock *LoopHeader, MachineBasicBlock *LoopEnd,
116+
SmallSetVector<Register, 16> &CandidateRegs,
117+
SmallSetVector<MachineBasicBlock *, 2> &Blocks,
118+
SmallVectorImpl<MachineInstr *> &Instructions) const;
117119

118120
void findNonPHIUsesInBlock(Register Reg, MachineBasicBlock *MBB,
119121
SmallVectorImpl<MachineInstr *> &Uses) const;
@@ -131,7 +133,10 @@ class SIOptimizeVGPRLiveRange : public MachineFunctionPass {
131133
MachineBasicBlock *Flow, MachineBasicBlock *Endif,
132134
SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const;
133135

134-
void optimizeWaterfallLiveRange(Register Reg, MachineBasicBlock *If) const;
136+
void optimizeWaterfallLiveRange(
137+
Register Reg, MachineBasicBlock *LoopHeader,
138+
SmallSetVector<MachineBasicBlock *, 2> &LoopBlocks,
139+
SmallVectorImpl<MachineInstr *> &Instructions) const;
135140

136141
SIOptimizeVGPRLiveRange() : MachineFunctionPass(ID) {}
137142

@@ -323,12 +328,30 @@ void SIOptimizeVGPRLiveRange::collectCandidateRegisters(
323328
/// Collect the registers used in the waterfall loop blocks that are defined
324329
/// before.
325330
void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters(
326-
MachineBasicBlock *Loop,
327-
SmallSetVector<Register, 16> &CandidateRegs) const {
331+
MachineBasicBlock *LoopHeader, MachineBasicBlock *LoopEnd,
332+
SmallSetVector<Register, 16> &CandidateRegs,
333+
SmallSetVector<MachineBasicBlock *, 2> &Blocks,
334+
SmallVectorImpl<MachineInstr *> &Instructions) const {
335+
336+
// Collect loop instructions, potentially spanning multiple blocks
337+
auto *MBB = LoopHeader;
338+
for (;;) {
339+
Blocks.insert(MBB);
340+
for (auto &MI : *MBB) {
341+
if (MI.isDebugInstr())
342+
continue;
343+
Instructions.push_back(&MI);
344+
}
345+
if (MBB == LoopEnd)
346+
break;
347+
assert(MBB->pred_size() == 1 ||
348+
(MBB == LoopHeader && MBB->pred_size() == 2));
349+
assert(MBB->succ_size() == 1);
350+
MBB = *MBB->succ_begin();
351+
}
328352

329-
for (auto &MI : Loop->instrs()) {
330-
if (MI.isDebugInstr())
331-
continue;
353+
for (auto *I : Instructions) {
354+
auto &MI = *I;
332355

333356
for (auto &MO : MI.operands()) {
334357
if (!MO.isReg() || !MO.getReg() || MO.isDef())
@@ -340,16 +363,17 @@ void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters(
340363
continue;
341364

342365
if (MO.readsReg()) {
343-
const MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent();
366+
MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent();
344367
// Make sure the value is defined outside the waterfall loop blocks
345-
if (DefMBB != Loop && !CandidateRegs.contains(MOReg)) {
368+
if (!Blocks.contains(DefMBB) && !CandidateRegs.contains(MOReg)) {
346369
// If the variable is used after the loop, the register coalescer will
347370
// merge the newly created register and remove the phi node again.
348371
// Just do nothing in that case.
349372
LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(MOReg);
350373
bool IsUsed = false;
351-
for (auto *Succ : Loop->successors()) {
352-
if (Succ != Loop && OldVarInfo.isLiveIn(*Succ, MOReg, *MRI)) {
374+
for (auto *Succ : LoopEnd->successors()) {
375+
if (!Blocks.contains(Succ) &&
376+
OldVarInfo.isLiveIn(*Succ, MOReg, *MRI)) {
353377
IsUsed = true;
354378
break;
355379
}
@@ -513,7 +537,9 @@ void SIOptimizeVGPRLiveRange::optimizeLiveRange(
513537
}
514538

515539
void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange(
516-
Register Reg, MachineBasicBlock *Loop) const {
540+
Register Reg, MachineBasicBlock *LoopHeader,
541+
SmallSetVector<MachineBasicBlock *, 2> &Blocks,
542+
SmallVectorImpl<MachineInstr *> &Instructions) const {
517543
// Insert a new PHI, marking the value from the last loop iteration undef.
518544
LLVM_DEBUG(dbgs() << "Optimizing " << printReg(Reg, TRI) << '\n');
519545
const auto *RC = MRI->getRegClass(Reg);
@@ -525,15 +551,16 @@ void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange(
525551
for (auto &O : make_early_inc_range(MRI->use_operands(Reg))) {
526552
auto *UseMI = O.getParent();
527553
auto *UseBlock = UseMI->getParent();
528-
// Replace uses in Loop block
529-
if (UseBlock == Loop)
554+
// Replace uses in Loop blocks
555+
if (Blocks.contains(UseBlock))
530556
O.setReg(NewReg);
531557
}
532558

533-
MachineInstrBuilder PHI = BuildMI(*Loop, Loop->getFirstNonPHI(), DebugLoc(),
534-
TII->get(TargetOpcode::PHI), NewReg);
535-
for (auto *Pred : Loop->predecessors()) {
536-
if (Pred == Loop)
559+
MachineInstrBuilder PHI =
560+
BuildMI(*LoopHeader, LoopHeader->getFirstNonPHI(), DebugLoc(),
561+
TII->get(TargetOpcode::PHI), NewReg);
562+
for (auto *Pred : LoopHeader->predecessors()) {
563+
if (Blocks.contains(Pred))
537564
PHI.addReg(UndefReg, RegState::Undef).addMBB(Pred);
538565
else
539566
PHI.addReg(Reg).addMBB(Pred);
@@ -542,21 +569,36 @@ void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange(
542569
LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo(NewReg);
543570
LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
544571

545-
// collectWaterfallCandidateRegisters only collects registers that are dead
546-
// after the loop. So we know that the old reg is not live throughout the
547-
// whole block anymore.
548-
OldVarInfo.AliveBlocks.reset(Loop->getNumber());
549-
550-
// Mark the last use as kill
551-
for (auto &MI : reverse(Loop->instrs())) {
552-
if (MI.readsRegister(NewReg, TRI)) {
553-
MI.addRegisterKilled(NewReg, TRI);
554-
NewVarInfo.Kills.push_back(&MI);
572+
// Find last use and mark as kill
573+
MachineInstr *Kill = nullptr;
574+
for (auto *MI : reverse(Instructions)) {
575+
if (MI->readsRegister(NewReg, TRI)) {
576+
MI->addRegisterKilled(NewReg, TRI);
577+
NewVarInfo.Kills.push_back(MI);
578+
Kill = MI;
555579
break;
556580
}
557581
}
558-
assert(!NewVarInfo.Kills.empty() &&
559-
"Failed to find last usage of register in loop");
582+
assert(Kill && "Failed to find last usage of register in loop");
583+
584+
MachineBasicBlock *KillBlock = Kill->getParent();
585+
bool PostKillBlock = false;
586+
for (auto *Block : Blocks) {
587+
auto BBNum = Block->getNumber();
588+
589+
// collectWaterfallCandidateRegisters only collects registers that are dead
590+
// after the loop. So we know that the old reg is no longer live throughout
591+
// the waterfall loop.
592+
OldVarInfo.AliveBlocks.reset(BBNum);
593+
594+
// The new register is live up to (and including) the block that kills it.
595+
PostKillBlock |= (Block == KillBlock);
596+
if (PostKillBlock) {
597+
NewVarInfo.AliveBlocks.reset(BBNum);
598+
} else if (Block != LoopHeader) {
599+
NewVarInfo.AliveBlocks.set(BBNum);
600+
}
601+
}
560602
}
561603

562604
char SIOptimizeVGPRLiveRange::ID = 0;
@@ -620,15 +662,22 @@ bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) {
620662
for (auto Reg : CandidateRegs)
621663
optimizeLiveRange(Reg, &MBB, IfTarget, Endif, ElseBlocks);
622664
} else if (MI.getOpcode() == AMDGPU::SI_WATERFALL_LOOP) {
665+
auto *LoopHeader = MI.getOperand(0).getMBB();
666+
auto *LoopEnd = &MBB;
667+
623668
LLVM_DEBUG(dbgs() << "Checking Waterfall loop: "
624-
<< printMBBReference(MBB) << '\n');
669+
<< printMBBReference(*LoopHeader) << '\n');
625670

626671
SmallSetVector<Register, 16> CandidateRegs;
627-
collectWaterfallCandidateRegisters(&MBB, CandidateRegs);
672+
SmallVector<MachineInstr *, 16> Instructions;
673+
SmallSetVector<MachineBasicBlock *, 2> Blocks;
674+
675+
collectWaterfallCandidateRegisters(LoopHeader, LoopEnd, CandidateRegs,
676+
Blocks, Instructions);
628677
MadeChange |= !CandidateRegs.empty();
629678
// Now we are safe to optimize.
630679
for (auto Reg : CandidateRegs)
631-
optimizeWaterfallLiveRange(Reg, &MBB);
680+
optimizeWaterfallLiveRange(Reg, LoopHeader, Blocks, Instructions);
632681
}
633682
}
634683
}

0 commit comments

Comments
 (0)