7
7
// ===----------------------------------------------------------------------===//
8
8
//
9
9
// / \file
10
- // / This pass tries to remove unnecessary VGPR live range in divergent if-else
11
- // / structure .
10
+ // / This pass tries to remove unnecessary VGPR live ranges in divergent if-else
11
+ // / structures and waterfall loops .
12
12
// /
13
- // / When we do structurization, we usually transform a if-else into two
13
+ // / When we do structurization, we usually transform an if-else into two
14
14
// / successive if-then (with a flow block to do predicate inversion). Consider a
15
15
// / simple case after structurization: A divergent value %a was defined before
16
16
// / if-else and used in both THEN (use in THEN is optional) and ELSE part:
29
29
// /
30
30
// / As register allocator has no idea of the thread-control-flow, it will just
31
31
// / assume %a would be alive in the whole range of bb.then because of a later
32
- // / use in bb.else. On AMDGPU architecture, the VGPR was accessed with respect
32
+ // / use in bb.else. On AMDGPU architecture, the VGPR is accessed with respect
33
33
// / to exec mask. For this if-else case, the lanes active in bb.then will be
34
- // / inactive in bb.else, and vice-verse . So we are safe to say that %a was dead
35
- // / after the last use in bb.then untill the end of the block. The reason is
34
+ // / inactive in bb.else, and vice-versa. So we are safe to say that %a is dead
35
+ // / after the last use in bb.then until the end of the block. The reason is
36
36
// / the instructions in bb.then will only overwrite lanes that will never be
37
37
// / accessed in bb.else.
38
38
// /
46
46
// / sure the second loop iteration still gets correct data.
47
47
// / 2.) There should be no further uses after the IF-ELSE region.
48
48
// /
49
+ // /
50
+ // / Waterfall loops get inserted around instructions that use divergent values
51
+ // / but can only be executed with a uniform value. For example an indirect call
52
+ // / to a divergent address:
53
+ // / bb.start:
54
+ // / %a = ...
55
+ // / %fun = ...
56
+ // / ...
57
+ // / bb.loop:
58
+ // / call %fun (%a)
59
+ // / ... // %a can be dead here
60
+ // / loop %bb.loop
61
+ // /
62
+ // / The loop block is executed multiple times, but it is run exactly once for
63
+ // / each active lane. Similar to the if-else case, the register allocator
64
+ // / assumes that %a is live throughout the loop as it is used again in the next
65
+ // / iteration. If %a is a VGPR that is unused after the loop, it does not need
66
+ // / to be live after its last use in the loop block. By inserting a phi-node at
67
+ // / the start of bb.loop that is undef when coming from bb.loop, the register
68
+ // / allocation knows that the value of %a does not need to be preserved through
69
+ // / iterations of the loop.
70
+ // /
49
71
//
50
72
// ===----------------------------------------------------------------------===//
51
73
@@ -89,6 +111,10 @@ class SIOptimizeVGPRLiveRange : public MachineFunctionPass {
89
111
SmallSetVector<MachineBasicBlock *, 16 > &ElseBlocks,
90
112
SmallVectorImpl<Register> &CandidateRegs) const ;
91
113
114
/// Gather into \p CandidateRegs the virtual vector registers that are read by
/// a non-debug instruction in the waterfall loop block \p Loop, are defined in
/// a different block, and are not live into any successor of \p Loop other
/// than \p Loop itself.
+ void collectWaterfallCandidateRegisters (
115
+ MachineBasicBlock *Loop,
116
+ SmallSetVector<Register, 16 > &CandidateRegs) const ;
117
+
92
118
void findNonPHIUsesInBlock (Register Reg, MachineBasicBlock *MBB,
93
119
SmallVectorImpl<MachineInstr *> &Uses) const ;
94
120
@@ -105,6 +131,8 @@ class SIOptimizeVGPRLiveRange : public MachineFunctionPass {
105
131
MachineBasicBlock *Flow, MachineBasicBlock *Endif,
106
132
SmallSetVector<MachineBasicBlock *, 16 > &ElseBlocks) const ;
107
133
134
/// Shorten the live range of \p Reg inside the waterfall loop block \p Loop by
/// routing its in-loop uses through a new PHI whose back-edge input is undef.
+ void optimizeWaterfallLiveRange (Register Reg, MachineBasicBlock *Loop) const ;
135
+
108
136
SIOptimizeVGPRLiveRange () : MachineFunctionPass(ID) {}
109
137
110
138
bool runOnMachineFunction (MachineFunction &MF) override ;
@@ -278,6 +306,54 @@ void SIOptimizeVGPRLiveRange::collectCandidateRegisters(
278
306
}
279
307
}
280
308
309
+ // / Collect the registers used in the waterfall loop block that are defined
310
+ // / before.
311
+ void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters (
312
+ MachineBasicBlock *Loop,
313
+ SmallSetVector<Register, 16 > &CandidateRegs) const {
314
+
315
+ for (auto &MI : Loop->instrs ()) {
316
+ if (MI.isDebugInstr ())
317
+ continue ;
318
+
319
+ for (auto &MO : MI.operands ()) {
320
+ if (!MO.isReg () || !MO.getReg () || MO.isDef ())
321
+ continue ;
322
+
323
+ Register MOReg = MO.getReg ();
324
+ // We can only optimize AGPR/VGPR virtual register
325
+ if (MOReg.isPhysical () || !TRI->isVectorRegister (*MRI, MOReg))
326
+ continue ;
327
+
328
+ if (MO.readsReg ()) {
329
+ const MachineBasicBlock *DefMBB = MRI->getVRegDef (MOReg)->getParent ();
330
+ // Make sure the value is defined before the LOOP block
331
+ if (DefMBB != Loop && !CandidateRegs.contains (MOReg)) {
332
+ // If the variable is used after the loop, the register coalescer will
333
+ // merge the newly created register and remove the phi node again.
334
+ // Just do nothing in that case.
335
+ LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo (MOReg);
336
+ bool IsUsed = false ;
337
+ for (auto *Succ : Loop->successors ()) {
338
+ if (Succ != Loop && OldVarInfo.isLiveIn (*Succ, MOReg, *MRI)) {
339
+ IsUsed = true ;
340
+ break ;
341
+ }
342
+ }
343
+ if (!IsUsed) {
344
+ LLVM_DEBUG (dbgs () << " Found candidate reg: "
345
+ << printReg (MOReg, TRI, 0 , MRI) << ' \n ' );
346
+ CandidateRegs.insert (MOReg);
347
+ } else {
348
+ LLVM_DEBUG (dbgs () << " Reg is used after loop, ignoring: "
349
+ << printReg (MOReg, TRI, 0 , MRI) << ' \n ' );
350
+ }
351
+ }
352
+ }
353
+ }
354
+ }
355
+ }
356
+
281
357
// Re-calculate the liveness of \p Reg in the THEN-region
282
358
void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion (
283
359
Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow) const {
@@ -403,12 +479,8 @@ void SIOptimizeVGPRLiveRange::optimizeLiveRange(
403
479
}
404
480
405
481
// Replace all uses in the ELSE region or the PHIs in ENDIF block
406
- for (auto I = MRI->use_begin (Reg), E = MRI->use_end (); I != E;) {
407
- MachineOperand &O = *I;
408
- // This is a little bit tricky, the setReg() will update the linked list,
409
- // so we have to increment the iterator before setReg() to avoid skipping
410
- // some uses.
411
- ++I;
482
+ // Use early increment range because setReg() will update the linked list.
483
+ for (auto &O : make_early_inc_range (MRI->use_operands (Reg))) {
412
484
auto *UseMI = O.getParent ();
413
485
auto *UseBlock = UseMI->getParent ();
414
486
// Replace uses in Endif block
@@ -431,6 +503,53 @@ void SIOptimizeVGPRLiveRange::optimizeLiveRange(
431
503
updateLiveRangeInThenRegion (Reg, If, Flow);
432
504
}
433
505
506
+ void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange (
507
+ Register Reg, MachineBasicBlock *Loop) const {
508
+ // Insert a new PHI, marking the value from the last loop iteration undef.
509
+ LLVM_DEBUG (dbgs () << " Optimizing " << printReg (Reg, TRI) << ' \n ' );
510
+ const auto *RC = MRI->getRegClass (Reg);
511
+ Register NewReg = MRI->createVirtualRegister (RC);
512
+ Register UndefReg = MRI->createVirtualRegister (RC);
513
+
514
+ // Replace all uses in the LOOP region
515
+ // Use early increment range because setReg() will update the linked list.
516
+ for (auto &O : make_early_inc_range (MRI->use_operands (Reg))) {
517
+ auto *UseMI = O.getParent ();
518
+ auto *UseBlock = UseMI->getParent ();
519
+ // Replace uses in Loop block
520
+ if (UseBlock == Loop)
521
+ O.setReg (NewReg);
522
+ }
523
+
524
+ MachineInstrBuilder PHI = BuildMI (*Loop, Loop->getFirstNonPHI (), DebugLoc (),
525
+ TII->get (TargetOpcode::PHI), NewReg);
526
+ for (auto *Pred : Loop->predecessors ()) {
527
+ if (Pred == Loop)
528
+ PHI.addReg (UndefReg, RegState::Undef).addMBB (Pred);
529
+ else
530
+ PHI.addReg (Reg).addMBB (Pred);
531
+ }
532
+
533
+ LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo (NewReg);
534
+ LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo (Reg);
535
+
536
+ // collectWaterfallCandidateRegisters only collects registers that are dead
537
+ // after the loop. So we know that the old reg is not live throughout the
538
+ // whole block anymore.
539
+ OldVarInfo.AliveBlocks .reset (Loop->getNumber ());
540
+
541
+ // Mark the last use as kill
542
+ for (auto &MI : reverse (Loop->instrs ())) {
543
+ if (MI.readsRegister (NewReg, TRI)) {
544
+ MI.addRegisterKilled (NewReg, TRI);
545
+ NewVarInfo.Kills .push_back (&MI);
546
+ break ;
547
+ }
548
+ }
549
+ assert (!NewVarInfo.Kills .empty () &&
550
+ " Failed to find last usage of register in loop" );
551
+ }
552
+
434
553
char SIOptimizeVGPRLiveRange::ID = 0 ;
435
554
436
555
INITIALIZE_PASS_BEGIN (SIOptimizeVGPRLiveRange, DEBUG_TYPE,
@@ -491,6 +610,16 @@ bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) {
491
610
// Now we are safe to optimize.
492
611
for (auto Reg : CandidateRegs)
493
612
optimizeLiveRange (Reg, &MBB, IfTarget, Endif, ElseBlocks);
613
+ } else if (MI.getOpcode () == AMDGPU::SI_WATERFALL_LOOP) {
614
+ LLVM_DEBUG (dbgs () << " Checking Waterfall loop: "
615
+ << printMBBReference (MBB) << ' \n ' );
616
+
617
+ SmallSetVector<Register, 16 > CandidateRegs;
618
+ collectWaterfallCandidateRegisters (&MBB, CandidateRegs);
619
+ MadeChange |= !CandidateRegs.empty ();
620
+ // Now we are safe to optimize.
621
+ for (auto Reg : CandidateRegs)
622
+ optimizeWaterfallLiveRange (Reg, &MBB);
494
623
}
495
624
}
496
625
}
0 commit comments