Skip to content

Commit fe48798

Browse files
committed
[AMDGPU] Reschedule loads in clauses to improve throughput
After clauses are formed, their internal loads can be reordered to facilitate some additional opportunities for overlapping computation. This late-stage rescheduling causes no change in register pressure.
1 parent 4c46ae3 commit fe48798

File tree

70 files changed

+8285
-6907
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

70 files changed

+8285
-6907
lines changed

llvm/lib/Target/AMDGPU/SIPostRABundler.cpp

Lines changed: 145 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "GCNSubtarget.h"
1818
#include "llvm/ADT/SmallSet.h"
1919
#include "llvm/CodeGen/MachineFunctionPass.h"
20+
#include <deque>
2021

2122
using namespace llvm;
2223

@@ -50,6 +51,7 @@ class SIPostRABundler {
5051
bool run(MachineFunction &MF);
5152

5253
private:
54+
const SIInstrInfo *TII = nullptr;
5355
const SIRegisterInfo *TRI;
5456

5557
SmallSet<Register, 16> Defs;
@@ -60,6 +62,9 @@ class SIPostRABundler {
6062
bool isBundleCandidate(const MachineInstr &MI) const;
6163
bool isDependentLoad(const MachineInstr &MI) const;
6264
bool canBundle(const MachineInstr &MI, const MachineInstr &NextMI) const;
65+
void reorderLoads(MachineBasicBlock &MBB,
66+
MachineBasicBlock::instr_iterator &BundleStart,
67+
MachineBasicBlock::instr_iterator Next);
6368
};
6469

6570
constexpr uint64_t MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF |
@@ -129,6 +134,141 @@ bool SIPostRABundler::canBundle(const MachineInstr &MI,
129134
!isDependentLoad(NextMI));
130135
}
131136

137+
/// Return the register defined by the first explicit def operand of \p MI.
/// Precondition: \p MI has at least one explicit def.
static Register getDef(MachineInstr &MI) {
  assert(MI.getNumExplicitDefs() > 0);
  auto FirstDef = MI.defs().begin();
  return FirstDef->getReg();
}
141+
142+
/// Reorder the loads inside the clause [BundleStart, Next) so that loads whose
/// results are used soonest are issued first.  This gives waitcnt insertion
/// more opportunity to overlap memory latency with computation.  The same set
/// of defs is live out of the clause either way, so register pressure is
/// unchanged.  \p BundleStart is updated to the new first instruction of the
/// clause.
void SIPostRABundler::reorderLoads(
    MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &BundleStart,
    MachineBasicBlock::instr_iterator Next) {
  // Don't reorder ALU, store or scalar clauses.
  if (!BundleStart->mayLoad() || BundleStart->mayStore() ||
      SIInstrInfo::isSMRD(*BundleStart) || !BundleStart->getNumExplicitDefs())
    return;

  // Search to find the usage distance of each defined register in the clause.
  // Use std::max<size_t>: the previous std::max(Defs.size(), 100UL) mixes
  // size_t with unsigned long, which breaks template argument deduction on
  // LLP64 targets (e.g. 64-bit Windows) where size_t is unsigned long long.
  const unsigned SearchDistance = std::max<size_t>(Defs.size(), 100);
  SmallDenseMap<Register, unsigned> UseDistance;
  unsigned MaxDistance = 0;
  for (MachineBasicBlock::iterator SearchI = Next;
       SearchI != MBB.end() && MaxDistance < SearchDistance &&
       UseDistance.size() < Defs.size();
       ++SearchI, ++MaxDistance) {
    for (Register Reg : Defs) {
      if (UseDistance.contains(Reg))
        continue;
      if (SearchI->readsRegister(Reg, TRI))
        UseDistance[Reg] = MaxDistance;
    }
  }

  // No clause def is read within the search window; nothing to gain.
  if (UseDistance.empty())
    return;

  LLVM_DEBUG(dbgs() << "Try bundle reordering\n");

  // Build schedule based on use distance of register uses.
  // Attempt to preserve existing order (NativeOrder) where possible.
  std::deque<std::pair<MachineInstr *, unsigned>> Schedule;
  unsigned NativeOrder = 0;
  unsigned LastOrder = 0;
  bool Reordered = false;
  for (auto II = BundleStart; II != Next; ++II, ++NativeOrder) {
    // Bail out if we encounter anything that seems risky to reorder.
    if (!II->getNumExplicitDefs() || II->isKill() ||
        llvm::any_of(II->memoperands(), [&](const MachineMemOperand *MMO) {
          return MMO->isAtomic() || MMO->isVolatile();
        })) {
      LLVM_DEBUG(dbgs() << " Abort\n");
      return;
    }

    Register Reg = getDef(*II);
    // Loads whose result is never read in the window sort last (MaxDistance).
    // Single map lookup instead of contains() followed by operator[].
    auto DistIt = UseDistance.find(Reg);
    unsigned NewOrder =
        DistIt != UseDistance.end() ? DistIt->second : MaxDistance;
    LLVM_DEBUG(dbgs() << " Order: " << NewOrder << "," << NativeOrder
                      << ", MI: " << *II);
    // Pack (use distance, native order) into one sort key; the native order
    // in the low bits breaks ties, keeping the sort stable with respect to
    // the original clause order.
    unsigned Order = (NewOrder << 16 | NativeOrder);
    Schedule.emplace_back(&*II, Order);
    Reordered |= Order < LastOrder;
    LastOrder = Order;
  }

  // No reordering found.
  if (!Reordered) {
    LLVM_DEBUG(dbgs() << " No changes\n");
    return;
  }

  // Apply sort on new ordering.  Take the pairs by const reference; copying
  // them per comparison is needless work.
  std::sort(Schedule.begin(), Schedule.end(),
            [](const std::pair<MachineInstr *, unsigned> &A,
               const std::pair<MachineInstr *, unsigned> &B) {
              return A.second < B.second;
            });

  // Rebuild clause order.
  // Schedule holds ideal order for the load operations; however, each def
  // can only be scheduled when it will no longer clobber any uses.
  SmallVector<MachineInstr *> Clause;
  while (!Schedule.empty()) {
    // Try to schedule next instruction in schedule.
    // Iterate until we find something that can be placed.
    auto It = Schedule.begin();
    while (It != Schedule.end()) {
      MachineInstr *MI = It->first;
      LLVM_DEBUG(dbgs() << "Try schedule: " << *MI);

      if (MI->getNumExplicitDefs() == 0) {
        // No defs, always schedule.  (Defensive: the loop above already
        // aborted on def-less instructions, so this should not trigger.)
        LLVM_DEBUG(dbgs() << " Trivially OK\n");
        break;
      }

      // Placing MI here would clobber DefReg for any not-yet-placed
      // instruction that still reads it.
      Register DefReg = getDef(*MI);
      bool DefRegHasUse = false;
      for (auto SearchIt = std::next(It);
           SearchIt != Schedule.end() && !DefRegHasUse; ++SearchIt)
        DefRegHasUse = SearchIt->first->readsRegister(DefReg, TRI);
      if (DefRegHasUse) {
        // A future use would be clobbered; try next instruction in the
        // schedule.
        LLVM_DEBUG(dbgs() << " Clobbers uses\n");
        ++It;
        continue;
      }

      // Safe to schedule.
      LLVM_DEBUG(dbgs() << " OK!\n");
      break;
    }

    // Place scheduled instruction into clause order.  At least one candidate
    // is always placeable: the last remaining one has no later reader.
    assert(It != Schedule.end());
    MachineInstr *MI = It->first;
    Schedule.erase(It);
    Clause.push_back(MI);

    // Clear kill flags on registers still read by instructions that will be
    // placed after MI; a stale kill ahead of a use is invalid MIR.
    for (auto &Use : MI->all_uses()) {
      if (!Use.isReg() || !Use.isKill())
        continue;
      Register UseReg = Use.getReg();
      if (llvm::any_of(Schedule,
                       [&](const std::pair<MachineInstr *, unsigned> &SI) {
                         return SI.first->readsRegister(UseReg, TRI);
                       }))
        Use.setIsKill(false);
    }
  }

  // Apply order to instructions: moving each clause member before Next in
  // the computed order re-sequences the whole clause in place.
  for (MachineInstr *MI : Clause)
    MI->moveBefore(&*Next);

  // Update start of bundle.
  BundleStart = Clause[0]->getIterator();
}
271+
132272
bool SIPostRABundlerLegacy::runOnMachineFunction(MachineFunction &MF) {
133273
if (skipFunction(MF.getFunction()))
134274
return false;
@@ -143,6 +283,8 @@ PreservedAnalyses SIPostRABundlerPass::run(MachineFunction &MF,
143283

144284
bool SIPostRABundler::run(MachineFunction &MF) {
145285

286+
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
287+
TII = ST.getInstrInfo();
146288
TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
147289
BitVector BundleUsedRegUnits(TRI->getNumRegUnits());
148290
BitVector KillUsedRegUnits(TRI->getNumRegUnits());
@@ -170,7 +312,7 @@ bool SIPostRABundler::run(MachineFunction &MF) {
170312
assert(Defs.empty());
171313

172314
if (I->getNumExplicitDefs() != 0)
173-
Defs.insert(I->defs().begin()->getReg());
315+
Defs.insert(getDef(*I));
174316

175317
MachineBasicBlock::instr_iterator BundleStart = I;
176318
MachineBasicBlock::instr_iterator BundleEnd = I;
@@ -182,7 +324,7 @@ bool SIPostRABundler::run(MachineFunction &MF) {
182324
if (canBundle(*BundleEnd, *I)) {
183325
BundleEnd = I;
184326
if (I->getNumExplicitDefs() != 0)
185-
Defs.insert(I->defs().begin()->getReg());
327+
Defs.insert(getDef(*I));
186328
++ClauseLength;
187329
} else if (!I->isMetaInstruction() ||
188330
I->getOpcode() == AMDGPU::SCHED_BARRIER) {
@@ -234,6 +376,7 @@ bool SIPostRABundler::run(MachineFunction &MF) {
234376
BundleUsedRegUnits.reset();
235377
}
236378

379+
reorderLoads(MBB, BundleStart, Next);
237380
finalizeBundle(MBB, BundleStart, Next);
238381
}
239382

llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -716,17 +716,17 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
716716
; GFX9-LABEL: add_v11i16:
717717
; GFX9: ; %bb.0:
718718
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
719-
; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
720719
; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:16
721720
; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:16
721+
; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
722722
; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off
723723
; GFX9-NEXT: global_load_ushort v16, v[2:3], off offset:20
724724
; GFX9-NEXT: global_load_ushort v17, v[0:1], off offset:20
725725
; GFX9-NEXT: global_load_ushort v18, v[0:1], off offset:18
726726
; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:18
727-
; GFX9-NEXT: s_waitcnt vmcnt(6)
727+
; GFX9-NEXT: s_waitcnt vmcnt(7)
728728
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14
729-
; GFX9-NEXT: s_waitcnt vmcnt(5)
729+
; GFX9-NEXT: s_waitcnt vmcnt(6)
730730
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15
731731
; GFX9-NEXT: s_waitcnt vmcnt(4)
732732
; GFX9-NEXT: v_pk_add_u16 v0, v6, v10

0 commit comments

Comments
 (0)