Skip to content

Commit d57286a

Browse files
committed
[AMDGPU] Dynamically set load latency in the scheduler
1 parent 450737f commit d57286a

File tree

5 files changed

+931
-1
lines changed

5 files changed

+931
-1
lines changed

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1150,6 +1150,40 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
11501150
GCNSchedStage::finalizeGCNSchedStage();
11511151
}
11521152

1153+
bool ILPInitialScheduleStage::initGCNSchedStage() {
1154+
if (!GCNSchedStage::initGCNSchedStage())
1155+
return false;
1156+
1157+
const SIInstrInfo *TII = ST.getInstrInfo();
1158+
OriginalLoadLatencyScaleFactor = TII->getLoadLatencyScaleFactor();
1159+
OriginalDSReadLatencyScaleFactor = TII->getDSReadLatencyScaleFactor();
1160+
OriginalVMEMLoadLatencyScaleFactor = TII->getVMEMLoadLatencyScaleFactor();
1161+
const unsigned ILPLoadLatencyScaleFactorDefault = 300;
1162+
if (ILPLoadLatencyScaleFactorDefault > TII->getLoadLatencyScaleFactor())
1163+
TII->setLoadLatencyScaleFactor(ILPLoadLatencyScaleFactorDefault);
1164+
if (ILPLoadLatencyScaleFactorDefault > TII->getDSReadLatencyScaleFactor())
1165+
TII->setDSReadLatencyScaleFactor(ILPLoadLatencyScaleFactorDefault);
1166+
if (ILPLoadLatencyScaleFactorDefault > TII->getVMEMLoadLatencyScaleFactor())
1167+
TII->setVMEMLoadLatencyScaleFactor(ILPLoadLatencyScaleFactorDefault);
1168+
1169+
LLVM_DEBUG(dbgs() << "ILP Initial Schedule: Set load latency scale factor to "
1170+
<< TII->getLoadLatencyScaleFactor() << '\n');
1171+
return true;
1172+
}
1173+
1174+
void ILPInitialScheduleStage::finalizeGCNSchedStage() {
1175+
const SIInstrInfo *TII = ST.getInstrInfo();
1176+
TII->setLoadLatencyScaleFactor(OriginalLoadLatencyScaleFactor);
1177+
TII->setDSReadLatencyScaleFactor(OriginalDSReadLatencyScaleFactor);
1178+
TII->setVMEMLoadLatencyScaleFactor(OriginalVMEMLoadLatencyScaleFactor);
1179+
1180+
LLVM_DEBUG(
1181+
dbgs() << "ILP Initial Schedule: Restored load latency scale factor to "
1182+
<< OriginalLoadLatencyScaleFactor << "\n");
1183+
1184+
GCNSchedStage::finalizeGCNSchedStage();
1185+
}
1186+
11531187
bool GCNSchedStage::initGCNRegion() {
11541188
// Check whether this new region is also a new block.
11551189
if (DAG.RegionBegin->getParent() != CurrentMBB)

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -515,8 +515,15 @@ class PreRARematStage : public GCNSchedStage {
515515
};
516516

517517
class ILPInitialScheduleStage : public GCNSchedStage {
518+
private:
519+
unsigned OriginalLoadLatencyScaleFactor = 0;
520+
unsigned OriginalDSReadLatencyScaleFactor = 0;
521+
unsigned OriginalVMEMLoadLatencyScaleFactor = 0;
522+
518523
public:
519524
bool shouldRevertScheduling(unsigned WavesAfter) override;
525+
bool initGCNSchedStage() override;
526+
void finalizeGCNSchedStage() override;
520527

521528
ILPInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
522529
: GCNSchedStage(StageID, DAG) {}

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,29 @@ static cl::opt<bool> Fix16BitCopies(
6262
cl::init(true),
6363
cl::ReallyHidden);
6464

65+
// Hidden option that seeds SIInstrInfo's generic load latency scale factor,
// i.e. the one applied to loads that are neither DS nor VMEM/FLAT. The value
// is a percentage: 100 leaves the modeled latency unchanged.
static cl::opt<unsigned> AMDGPULoadLatencyScaleFactor(
    "amdgpu-load-latency-scale-factor",
    cl::desc("Scale factor for load instruction latency. Final latency is "
             "scaled by `Factor / 100 * Latency`."),
    cl::init(100), cl::ReallyHidden);
70+
71+
// Hidden option that seeds SIInstrInfo's scale factor for DS (LDS) reads.
// The value is a percentage: 100 leaves the modeled latency unchanged.
static cl::opt<unsigned> AMDGPUDSReadLatencyScaleFactor(
72+
"amdgpu-ds-read-latency-scale-factor",
73+
cl::desc("Scale factor for LDS (DS) read instruction latency. Final "
74+
"latency is scaled by `Factor / 100 * Latency`."),
75+
cl::init(100), cl::ReallyHidden);
76+
77+
// Hidden option that seeds SIInstrInfo's scale factor for VMEM and FLAT loads.
// The value is a percentage: 100 leaves the modeled latency unchanged.
static cl::opt<unsigned> AMDGPUVMEMLoadLatencyScaleFactor(
78+
"amdgpu-vmem-load-latency-scale-factor",
79+
cl::desc("Scale factor for VMEM/BUFFER/FLAT load instruction latency. "
80+
"Final latency is scaled by `Factor / 100 * Latency`."),
81+
cl::init(100), cl::ReallyHidden);
82+
6583
// Constructs the instruction info for subtarget ST. The three load-latency
// scale factors are initialized from the amdgpu-*-latency-scale-factor
// command-line options, and the scheduling model is initialized from ST.
SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
6684
: AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
67-
RI(ST), ST(ST) {
85+
RI(ST), ST(ST), LoadLatencyScaleFactor(AMDGPULoadLatencyScaleFactor),
86+
DSReadLatencyScaleFactor(AMDGPUDSReadLatencyScaleFactor),
87+
VMEMLoadLatencyScaleFactor(AMDGPUVMEMLoadLatencyScaleFactor) {
6888
SchedModel.init(&ST);
6989
}
7090

@@ -10240,6 +10260,43 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
1024010260
return SchedModel.computeInstrLatency(&MI);
1024110261
}
1024210262

10263+
std::optional<unsigned>
10264+
SIInstrInfo::getInstrLatency(const TargetSchedModel &TargetSchedModel,
10265+
const MachineInstr &MI) const {
10266+
auto LatencyOpt = TargetInstrInfo::getInstrLatency(TargetSchedModel, MI);
10267+
if (!LatencyOpt)
10268+
return std::nullopt;
10269+
unsigned Latency = *LatencyOpt;
10270+
if (MI.mayLoad()) {
10271+
unsigned Scale = LoadLatencyScaleFactor;
10272+
if (isDS(MI))
10273+
Scale = DSReadLatencyScaleFactor;
10274+
else if (isVMEM(MI) || isFLAT(MI))
10275+
Scale = VMEMLoadLatencyScaleFactor;
10276+
Latency = (Latency * Scale) / 100;
10277+
}
10278+
return Latency;
10279+
}
10280+
10281+
std::optional<unsigned> SIInstrInfo::getOperandLatency(
10282+
const TargetSchedModel &SchedModel, const MachineInstr *DefMI,
10283+
unsigned DefOperIdx, const MachineInstr *UseMI, unsigned UseOperIdx) const {
10284+
auto LatOpt = TargetInstrInfo::getOperandLatency(
10285+
SchedModel, DefMI, DefOperIdx, UseMI, UseOperIdx);
10286+
if (!LatOpt)
10287+
return std::nullopt;
10288+
unsigned Latency = *LatOpt;
10289+
if (DefMI && DefMI->mayLoad()) {
10290+
unsigned Scale = LoadLatencyScaleFactor;
10291+
if (isDS(*DefMI))
10292+
Scale = DSReadLatencyScaleFactor;
10293+
else if (isVMEM(*DefMI) || isFLAT(*DefMI))
10294+
Scale = VMEMLoadLatencyScaleFactor;
10295+
Latency = (Latency * Scale) / 100;
10296+
}
10297+
return Latency;
10298+
}
10299+
1024310300
InstructionUniformity
1024410301
SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
1024510302
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,13 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
9393
const GCNSubtarget &ST;
9494
TargetSchedModel SchedModel;
9595
mutable std::unique_ptr<AMDGPUMIRFormatter> Formatter;
96+
// Final load latency in the machine model is scaled by
97+
// `Factor / 100 * Latency`
98+
mutable unsigned LoadLatencyScaleFactor = 100;
99+
// Separate scale factor for LDS (DS) read operations.
100+
mutable unsigned DSReadLatencyScaleFactor = 100;
101+
// Separate scale factor for VMEM/BUFFER/FLAT loads.
102+
mutable unsigned VMEMLoadLatencyScaleFactor = 100;
96103

97104
// The inverse predicate should have the negative value.
98105
enum BranchPredicate {
@@ -111,6 +118,38 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
111118
static BranchPredicate getBranchPredicate(unsigned Opcode);
112119

113120
public:
121+
// Overrides the generic load latency scale factor (a percentage of the
// modeled latency). Declared const; the member it writes is mutable, so the
// factor can be changed through a const SIInstrInfo.
void setLoadLatencyScaleFactor(unsigned Factor) const {
122+
LoadLatencyScaleFactor = Factor;
123+
}
124+
125+
// Returns the current generic load latency scale factor.
unsigned getLoadLatencyScaleFactor() const { return LoadLatencyScaleFactor; }
126+
127+
// Control DS read (LDS) latency scaling independently when desired.
128+
// Overrides the scale factor applied to DS loads (a percentage of the modeled
// latency). Declared const; the member it writes is mutable.
void setDSReadLatencyScaleFactor(unsigned Factor) const {
129+
DSReadLatencyScaleFactor = Factor;
130+
}
131+
// Returns the current scale factor applied to DS loads.
unsigned getDSReadLatencyScaleFactor() const {
132+
return DSReadLatencyScaleFactor;
133+
}
134+
135+
// Control VMEM/BUFFER/FLAT load latency scaling independently.
136+
// Overrides the scale factor applied to VMEM and FLAT loads (a percentage of
// the modeled latency). Declared const; the member it writes is mutable.
void setVMEMLoadLatencyScaleFactor(unsigned Factor) const {
137+
VMEMLoadLatencyScaleFactor = Factor;
138+
}
139+
// Returns the current scale factor applied to VMEM and FLAT loads.
unsigned getVMEMLoadLatencyScaleFactor() const {
140+
return VMEMLoadLatencyScaleFactor;
141+
}
142+
143+
// TargetSchedModel latency hooks.
144+
// Returns the base-class instruction latency for MI, multiplied by the
// matching load latency scale factor (as a percentage) when MI may load.
// Returns std::nullopt if the base class reports no latency.
std::optional<unsigned>
145+
getInstrLatency(const TargetSchedModel &TargetSchedModel,
146+
const MachineInstr &MI) const override;
147+
// Returns the base-class operand latency between DefMI and UseMI, multiplied
// by the matching load latency scale factor (as a percentage) when DefMI may
// load. Returns std::nullopt if the base class reports no latency.
std::optional<unsigned> getOperandLatency(const TargetSchedModel &SchedModel,
148+
const MachineInstr *DefMI,
149+
unsigned DefIdx,
150+
const MachineInstr *UseMI,
151+
unsigned UseIdx) const override;
152+
114153
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
115154
MachineRegisterInfo &MRI,
116155
const MachineOperand &SuperReg,

0 commit comments

Comments
 (0)