Skip to content

Commit d4c4f79

Browse files
committed
[AMDGPU] Dynamically set load latency in the scheduler
1 parent fe6782d commit d4c4f79

File tree

4 files changed

+60
-2
lines changed

4 files changed

+60
-2
lines changed

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1130,6 +1130,32 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
11301130
GCNSchedStage::finalizeGCNSchedStage();
11311131
}
11321132

1133+
bool ILPInitialScheduleStage::initGCNSchedStage() {
1134+
if (!GCNSchedStage::initGCNSchedStage())
1135+
return false;
1136+
1137+
const SIInstrInfo *TII = ST.getInstrInfo();
1138+
OriginalLoadLatencyScaleFactor = TII->getLoadLatencyScaleFactor();
1139+
const unsigned ILPLoadLatencyScaleFactorDefault = 300;
1140+
if (ILPLoadLatencyScaleFactorDefault > TII->getLoadLatencyScaleFactor())
1141+
TII->setLoadLatencyScaleFactor(ILPLoadLatencyScaleFactorDefault);
1142+
1143+
LLVM_DEBUG(dbgs() << "ILP Initial Schedule: Set load latency scale factor to "
1144+
<< TII->getLoadLatencyScaleFactor() << '\n');
1145+
return true;
1146+
}
1147+
1148+
void ILPInitialScheduleStage::finalizeGCNSchedStage() {
1149+
const SIInstrInfo *TII = ST.getInstrInfo();
1150+
TII->setLoadLatencyScaleFactor(OriginalLoadLatencyScaleFactor);
1151+
1152+
LLVM_DEBUG(
1153+
dbgs() << "ILP Initial Schedule: Restored load latency scale factor to "
1154+
<< OriginalLoadLatencyScaleFactor << "\n");
1155+
1156+
GCNSchedStage::finalizeGCNSchedStage();
1157+
}
1158+
11331159
bool GCNSchedStage::initGCNRegion() {
11341160
// Check whether this new region is also a new block.
11351161
if (DAG.RegionBegin->getParent() != CurrentMBB)

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -474,8 +474,13 @@ class PreRARematStage : public GCNSchedStage {
474474
};
475475

476476
class ILPInitialScheduleStage : public GCNSchedStage {
477+
private:
478+
unsigned OriginalLoadLatencyScaleFactor = 0;
479+
477480
public:
478481
bool shouldRevertScheduling(unsigned WavesAfter) override;
482+
bool initGCNSchedStage() override;
483+
void finalizeGCNSchedStage() override;
479484

480485
ILPInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
481486
: GCNSchedStage(StageID, DAG) {}

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,15 @@ static cl::opt<bool> Fix16BitCopies(
6060
cl::init(true),
6161
cl::ReallyHidden);
6262

63+
// Hidden option that provides the initial value of SIInstrInfo's load latency
// scale factor. The value is a percentage; the default of 100 leaves load
// latencies unchanged.
static cl::opt<unsigned> AMDGPULoadLatencyScaleFactor(
    "amdgpu-load-latency-scale-factor",
    cl::desc("Scale factor for load instruction latency. Final latency is "
             "scaled by `Factor / 100 * Latency`."),
    cl::init(100), cl::ReallyHidden);
68+
6369
// Constructs the instruction info for subtarget ST and initializes SchedModel
// from it. In the added (+) initializer list, LoadLatencyScaleFactor starts at
// the value of the AMDGPULoadLatencyScaleFactor command-line option.
SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
64-
: AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
65-
RI(ST), ST(ST) {
70+
: AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
71+
RI(ST), ST(ST), LoadLatencyScaleFactor(AMDGPULoadLatencyScaleFactor) {
6672
SchedModel.init(&ST);
6773
}
6874

@@ -9792,6 +9798,15 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
97929798
return SchedModel.computeInstrLatency(&MI);
97939799
}
97949800

9801+
unsigned SIInstrInfo::getInstrLatency(const TargetSchedModel &TargetSchedModel,
9802+
const MachineInstr &MI) const {
9803+
unsigned Latency = TargetInstrInfo::getInstrLatency(TargetSchedModel, MI);
9804+
if (MI.mayLoad())
9805+
Latency *= LoadLatencyScaleFactor / 100;
9806+
9807+
return Latency;
9808+
}
9809+
97959810
InstructionUniformity
97969811
SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
97979812
unsigned opcode = MI.getOpcode();

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
8888
const GCNSubtarget &ST;
8989
TargetSchedModel SchedModel;
9090
mutable std::unique_ptr<AMDGPUMIRFormatter> Formatter;
91+
// Final load latency in the machine model is scaled by
92+
// `Factor / 100 * Latency`
93+
mutable unsigned LoadLatencyScaleFactor = 100;
9194

9295
// The inverse predicate should have the negative value.
9396
enum BranchPredicate {
@@ -106,6 +109,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
106109
static BranchPredicate getBranchPredicate(unsigned Opcode);
107110

108111
public:
112+
void setLoadLatencyScaleFactor(unsigned Factor) const {
113+
LoadLatencyScaleFactor = Factor;
114+
}
115+
116+
unsigned getLoadLatencyScaleFactor() const { return LoadLatencyScaleFactor; }
117+
109118
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
110119
MachineRegisterInfo &MRI,
111120
const MachineOperand &SuperReg,
@@ -1462,6 +1471,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
14621471
const MachineInstr &MI,
14631472
unsigned *PredCost = nullptr) const override;
14641473

1474+
unsigned getInstrLatency(const TargetSchedModel &TargetSchedModel,
1475+
const MachineInstr &MI) const override;
1476+
14651477
InstructionUniformity
14661478
getInstructionUniformity(const MachineInstr &MI) const override final;
14671479

0 commit comments

Comments
 (0)