Skip to content

Commit 4169b41

Browse files
committed
[AMDGPU] Reschedule loads in clauses to improve throughput
After clauses are formed, their internal loads can be reordered to create additional opportunities for overlapping computation with outstanding loads. This late-stage rescheduling causes no change in register pressure.
1 parent 8410bab commit 4169b41

File tree

3 files changed

+329
-3
lines changed

3 files changed

+329
-3
lines changed

llvm/lib/Target/AMDGPU/SIPostRABundler.cpp

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ class SIPostRABundler : public MachineFunctionPass {
4444
}
4545

4646
private:
47+
const SIInstrInfo *TII = nullptr;
4748
const SIRegisterInfo *TRI;
4849

4950
SmallSet<Register, 16> Defs;
@@ -54,6 +55,9 @@ class SIPostRABundler : public MachineFunctionPass {
5455
bool isBundleCandidate(const MachineInstr &MI) const;
5556
bool isDependentLoad(const MachineInstr &MI) const;
5657
bool canBundle(const MachineInstr &MI, const MachineInstr &NextMI) const;
58+
void reorderLoads(MachineBasicBlock &MBB,
59+
MachineBasicBlock::instr_iterator &BundleStart,
60+
MachineBasicBlock::instr_iterator Next);
5761
};
5862

5963
constexpr uint64_t MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF |
@@ -121,10 +125,132 @@ bool SIPostRABundler::canBundle(const MachineInstr &MI,
121125
!isDependentLoad(NextMI));
122126
}
123127

128+
/// Reorder the loads of a freshly formed clause so that the results needed
/// soonest after the clause are issued first.
///
/// \param MBB          Block containing the clause.
/// \param BundleStart  First instruction of the clause; updated in place to
///                     the first instruction of the reordered clause.
/// \param Next         Instruction following the clause (may be the end of
///                     the block).
///
/// The instructions in [BundleStart, Next) are only permuted; none is added or
/// removed. If no safe order different from the current one is found the
/// clause is left untouched.
void SIPostRABundler::reorderLoads(
    MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &BundleStart,
    MachineBasicBlock::instr_iterator Next) {
  // Only the first instruction of the clause is inspected: it must be an
  // image (MIMG) instruction that does not store.
  if (!TII->isMIMG(BundleStart->getOpcode()) || BundleStart->mayStore())
    return;

  LLVM_DEBUG(dbgs() << "Begin bundle reorder\n");

  // Collect clause
  SmallVector<MachineInstr *> Clause;
  for (auto I = BundleStart; I != Next; ++I)
    Clause.push_back(&*I);

  // Search to find the usage distance of each defined register in the clause.
  // NOTE(review): Defs is presumably the set of registers defined by this
  // clause as recorded by the caller -- confirm against runOnMachineFunction.
  const unsigned MaxSearch = 100;
  SmallSet<Register, 16> DefRegs(Defs);
  SmallSet<unsigned, 16> Distances;
  DenseMap<Register, unsigned> UseDistance;
  unsigned Dist = 0;
  for (MachineBasicBlock::iterator SearchI = Next;
       SearchI != MBB.end() && Dist < MaxSearch && !DefRegs.empty();
       ++SearchI) {
    // Debug instructions must not influence the schedule, otherwise code
    // generation would differ between builds with and without debug info.
    if (SearchI->isDebugInstr())
      continue;

    // Collect first and erase afterwards so that DefRegs is not modified
    // while it is being iterated.
    SmallVector<Register, 4> Found;
    // FIXME: fix search efficiency
    for (Register DefReg : DefRegs) {
      if (SearchI->readsRegister(DefReg, TRI))
        Found.push_back(DefReg);
    }
    for (Register Reg : Found) {
      UseDistance[Reg] = Dist;
      DefRegs.erase(Reg);
      Distances.insert(Dist);
    }
    ++Dist;
  }

  // With at most one distinct distance there is nothing to prioritise.
  if (Distances.size() <= 1)
    return;

  // Assign every instruction of the clause a priority. Instructions whose
  // result has a known use distance get that distance; everything else
  // (no defs, or no use found within the search window) keeps a default
  // priority that is larger than any distance and preserves original order.
  // Every instruction stays in the schedule so the clause is only permuted.
  std::vector<std::pair<MachineInstr *, unsigned>> Schedule;
  Schedule.reserve(Clause.size());
  unsigned TotalOrder = Dist + 1;
  for (MachineInstr *MI : Clause) {
    unsigned Order = TotalOrder++;
    if (MI->getNumExplicitDefs() > 0) {
      auto UseIt = UseDistance.find(MI->defs().begin()->getReg());
      if (UseIt != UseDistance.end())
        Order = std::min(Order, UseIt->second);
    }
    LLVM_DEBUG(dbgs() << "Order: " << Order << ", MI: " << *MI);
    Schedule.push_back(std::pair(MI, Order));
  }

  // A stable sort keeps the original relative order of instructions with
  // equal priority, making the result independent of the sort implementation.
  std::stable_sort(Schedule.begin(), Schedule.end(),
                   [](const std::pair<MachineInstr *, unsigned> &A,
                      const std::pair<MachineInstr *, unsigned> &B) {
                     return A.second < B.second;
                   });

  // Rebuild clause order.
  // Schedule holds ideal order for the load operations; however, each def
  // can only be scheduled when it will no longer clobber any uses.
  SmallVector<MachineInstr *> NewOrder;
  while (!Schedule.empty()) {
    auto It = Schedule.begin();
    for (; It != Schedule.end(); ++It) {
      MachineInstr *MI = It->first;

      LLVM_DEBUG(dbgs() << "Try schedule: " << *MI);

      // FIXME: make this scan more efficient
      // An instruction without defs cannot clobber anything. Otherwise none
      // of its defs may be read by another instruction that is still waiting
      // to be scheduled; an instruction may clobber its own uses.
      bool ClobbersUse = false;
      for (const MachineOperand &Def : MI->defs()) {
        const Register Reg = Def.getReg();
        ClobbersUse = std::any_of(
            Schedule.begin(), Schedule.end(), [&](const auto &Other) {
              return Other.first != MI &&
                     Other.first->readsRegister(Reg, TRI);
            });
        if (ClobbersUse)
          break;
      }

      if (!ClobbersUse) {
        // Safe to schedule.
        LLVM_DEBUG(dbgs() << "  OK!\n");
        break;
      }

      // Use is clobbered; try next def in the schedule.
      LLVM_DEBUG(dbgs() << "  Clobbers uses\n");
    }

    if (It == Schedule.end()) {
      // No remaining instruction can be issued safely. Nothing has been moved
      // yet, so keeping the original order is always a valid fallback.
      LLVM_DEBUG(dbgs() << "No safe order found; keeping original clause\n");
      return;
    }

    NewOrder.push_back(It->first);
    Schedule.erase(It);
  }

  // Nothing to do if the preferred order is the one we already have.
  if (std::equal(NewOrder.begin(), NewOrder.end(), Clause.begin(),
                 Clause.end()))
    return;

  // Apply order to instructions. Splicing in front of the Next iterator works
  // even when the clause is the last thing in the block, where Next is the
  // end iterator and must not be dereferenced.
  const MachineBasicBlock::iterator InsertPt = Next;
  for (MachineInstr *MI : NewOrder)
    MBB.splice(InsertPt, &MBB, MachineBasicBlock::iterator(MI));

  // FIXME: update kill flags

  // Update start of bundle.
  BundleStart = NewOrder.front()->getIterator();
}
247+
124248
bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) {
125249
if (skipFunction(MF.getFunction()))
126250
return false;
127251

252+
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
253+
TII = ST.getInstrInfo();
128254
TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
129255
BitVector BundleUsedRegUnits(TRI->getNumRegUnits());
130256
BitVector KillUsedRegUnits(TRI->getNumRegUnits());
@@ -214,6 +340,7 @@ bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) {
214340
BundleUsedRegUnits.reset();
215341
}
216342

343+
reorderLoads(MBB, BundleStart, Next);
217344
finalizeBundle(MBB, BundleStart, Next);
218345
}
219346

llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,15 +61,16 @@ define void @issue92561(ptr addrspace(1) %arg) {
6161
; SDAG-NEXT: s_mov_b32 s7, s12
6262
; SDAG-NEXT: s_clause 0x2
6363
; SDAG-NEXT: image_sample_c_lz v0, [v1, v1, v0, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
64-
; SDAG-NEXT: image_sample_c_lz v3, [v1, v1, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
6564
; SDAG-NEXT: image_sample_c_lz v2, [v1, v2, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
65+
; SDAG-NEXT: image_sample_c_lz v3, [v1, v1, v1, v1], s[0:7], s[12:15] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
6666
; SDAG-NEXT: v_mov_b32_e32 v4, v1
6767
; SDAG-NEXT: s_waitcnt vmcnt(2)
6868
; SDAG-NEXT: v_add_f32_e32 v0, v9, v0
69-
; SDAG-NEXT: s_waitcnt vmcnt(0)
70-
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
69+
; SDAG-NEXT: s_waitcnt vmcnt(1)
70+
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
7171
; SDAG-NEXT: v_add_f32_e32 v0, v2, v0
7272
; SDAG-NEXT: v_mov_b32_e32 v2, v1
73+
; SDAG-NEXT: s_waitcnt vmcnt(0)
7374
; SDAG-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_mov_b32 v3, v1
7475
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
7576
; SDAG-NEXT: v_mul_f32_e32 v0, 0x3e800000, v0

0 commit comments

Comments
 (0)