Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1150,6 +1150,40 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
GCNSchedStage::finalizeGCNSchedStage();
}

bool ILPInitialScheduleStage::initGCNSchedStage() {
if (!GCNSchedStage::initGCNSchedStage())
return false;

const SIInstrInfo *TII = ST.getInstrInfo();
OriginalLoadLatencyScaleFactor = TII->getLoadLatencyScaleFactor();
OriginalDSReadLatencyScaleFactor = TII->getDSReadLatencyScaleFactor();
OriginalVMEMLoadLatencyScaleFactor = TII->getVMEMLoadLatencyScaleFactor();
const unsigned ILPLoadLatencyScaleFactorDefault = 300;
if (ILPLoadLatencyScaleFactorDefault > TII->getLoadLatencyScaleFactor())
TII->setLoadLatencyScaleFactor(ILPLoadLatencyScaleFactorDefault);
if (ILPLoadLatencyScaleFactorDefault > TII->getDSReadLatencyScaleFactor())
TII->setDSReadLatencyScaleFactor(ILPLoadLatencyScaleFactorDefault);
if (ILPLoadLatencyScaleFactorDefault > TII->getVMEMLoadLatencyScaleFactor())
TII->setVMEMLoadLatencyScaleFactor(ILPLoadLatencyScaleFactorDefault);

LLVM_DEBUG(dbgs() << "ILP Initial Schedule: Set load latency scale factor to "
<< TII->getLoadLatencyScaleFactor() << '\n');
return true;
}

void ILPInitialScheduleStage::finalizeGCNSchedStage() {
const SIInstrInfo *TII = ST.getInstrInfo();
TII->setLoadLatencyScaleFactor(OriginalLoadLatencyScaleFactor);
TII->setDSReadLatencyScaleFactor(OriginalDSReadLatencyScaleFactor);
TII->setVMEMLoadLatencyScaleFactor(OriginalVMEMLoadLatencyScaleFactor);

LLVM_DEBUG(
dbgs() << "ILP Initial Schedule: Restored load latency scale factor to "
<< OriginalLoadLatencyScaleFactor << "\n");

GCNSchedStage::finalizeGCNSchedStage();
}

bool GCNSchedStage::initGCNRegion() {
// Check whether this new region is also a new block.
if (DAG.RegionBegin->getParent() != CurrentMBB)
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
Original file line number Diff line number Diff line change
Expand Up @@ -515,8 +515,15 @@ class PreRARematStage : public GCNSchedStage {
};

class ILPInitialScheduleStage : public GCNSchedStage {
private:
// Load-latency scale factors read from SIInstrInfo in initGCNSchedStage() and
// written back in finalizeGCNSchedStage(). One field per load category:
// generic loads, DS (LDS) reads, and VMEM/FLAT loads.
unsigned OriginalLoadLatencyScaleFactor = 0;
unsigned OriginalDSReadLatencyScaleFactor = 0;
unsigned OriginalVMEMLoadLatencyScaleFactor = 0;

public:
bool shouldRevertScheduling(unsigned WavesAfter) override;
bool initGCNSchedStage() override;
void finalizeGCNSchedStage() override;

ILPInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
: GCNSchedStage(StageID, DAG) {}
Expand Down
59 changes: 58 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,29 @@ static cl::opt<bool> Fix16BitCopies(
cl::init(true),
cl::ReallyHidden);

// Percentage applied to the latency of load instructions that are neither DS
// nor VMEM/FLAT (100 leaves the latency unchanged). Used as the initial value
// of SIInstrInfo::LoadLatencyScaleFactor.
static cl::opt<unsigned> AMDGPULoadLatencyScaleFactor(
    "amdgpu-load-latency-scale-factor",
    cl::desc("Scale factor for load instruction latency. Final latency is "
             "scaled by `Factor / 100 * Latency`."),
    cl::init(100), cl::ReallyHidden);

// Percentage applied to the latency of DS (LDS) loads (100 leaves the latency
// unchanged). Used as the initial value of
// SIInstrInfo::DSReadLatencyScaleFactor.
static cl::opt<unsigned> AMDGPUDSReadLatencyScaleFactor(
"amdgpu-ds-read-latency-scale-factor",
cl::desc("Scale factor for LDS (DS) read instruction latency. Final "
"latency is scaled by `Factor / 100 * Latency`."),
cl::init(100), cl::ReallyHidden);

// Percentage applied to the latency of loads classified as VMEM or FLAT (100
// leaves the latency unchanged). Used as the initial value of
// SIInstrInfo::VMEMLoadLatencyScaleFactor.
static cl::opt<unsigned> AMDGPUVMEMLoadLatencyScaleFactor(
"amdgpu-vmem-load-latency-scale-factor",
cl::desc("Scale factor for VMEM/BUFFER/FLAT load instruction latency. "
"Final latency is scaled by `Factor / 100 * Latency`."),
cl::init(100), cl::ReallyHidden);

/// Constructs the instruction info for subtarget \p ST.
///
/// The three load-latency scale factors start from the values of their
/// command-line options, and the scheduling model is initialized from \p ST.
SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
    : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
      RI(ST), ST(ST), LoadLatencyScaleFactor(AMDGPULoadLatencyScaleFactor),
      DSReadLatencyScaleFactor(AMDGPUDSReadLatencyScaleFactor),
      VMEMLoadLatencyScaleFactor(AMDGPUVMEMLoadLatencyScaleFactor) {
  SchedModel.init(&ST);
}

Expand Down Expand Up @@ -10240,6 +10260,43 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
return SchedModel.computeInstrLatency(&MI);
}

/// Returns the latency of \p MI as reported by the generic TargetInstrInfo
/// hook, scaled for load instructions.
///
/// The scale factor is a percentage (100 leaves the latency unchanged) picked
/// by load category: DS reads use DSReadLatencyScaleFactor, VMEM and FLAT
/// loads use VMEMLoadLatencyScaleFactor, and every other load uses
/// LoadLatencyScaleFactor. Instructions that may not load are not scaled.
///
/// \returns std::nullopt when the generic hook returns no latency; otherwise
/// the (possibly scaled) latency, saturated at the maximum `unsigned` value.
std::optional<unsigned>
SIInstrInfo::getInstrLatency(const TargetSchedModel &TargetSchedModel,
                             const MachineInstr &MI) const {
  const std::optional<unsigned> BaseLatency =
      TargetInstrInfo::getInstrLatency(TargetSchedModel, MI);
  if (!BaseLatency || !MI.mayLoad())
    return BaseLatency;

  unsigned Scale = LoadLatencyScaleFactor;
  if (isDS(MI))
    Scale = DSReadLatencyScaleFactor;
  else if (isVMEM(MI) || isFLAT(MI))
    Scale = VMEMLoadLatencyScaleFactor;

  // The factor comes from an unbounded command-line option, so do the multiply
  // in 64 bits and saturate instead of letting a 32-bit product wrap around.
  const unsigned long long Scaled =
      static_cast<unsigned long long>(*BaseLatency) * Scale / 100;
  constexpr unsigned MaxLatency = ~0u;
  return Scaled > MaxLatency ? MaxLatency : static_cast<unsigned>(Scaled);
}

/// Returns the def-to-use operand latency reported by the generic
/// TargetInstrInfo hook, scaled when the defining instruction is a load.
///
/// The scale factor is a percentage (100 leaves the latency unchanged) picked
/// from \p DefMI: DS reads use DSReadLatencyScaleFactor, VMEM and FLAT loads
/// use VMEMLoadLatencyScaleFactor, and every other load uses
/// LoadLatencyScaleFactor. A null \p DefMI, or one that may not load, leaves
/// the latency unscaled.
///
/// \returns std::nullopt when the generic hook returns no latency; otherwise
/// the (possibly scaled) latency, saturated at the maximum `unsigned` value.
std::optional<unsigned> SIInstrInfo::getOperandLatency(
    const TargetSchedModel &SchedModel, const MachineInstr *DefMI,
    unsigned DefOperIdx, const MachineInstr *UseMI, unsigned UseOperIdx) const {
  const std::optional<unsigned> BaseLatency =
      TargetInstrInfo::getOperandLatency(SchedModel, DefMI, DefOperIdx, UseMI,
                                         UseOperIdx);
  if (!BaseLatency || !DefMI || !DefMI->mayLoad())
    return BaseLatency;

  unsigned Scale = LoadLatencyScaleFactor;
  if (isDS(*DefMI))
    Scale = DSReadLatencyScaleFactor;
  else if (isVMEM(*DefMI) || isFLAT(*DefMI))
    Scale = VMEMLoadLatencyScaleFactor;

  // The factor comes from an unbounded command-line option, so do the multiply
  // in 64 bits and saturate instead of letting a 32-bit product wrap around.
  const unsigned long long Scaled =
      static_cast<unsigned long long>(*BaseLatency) * Scale / 100;
  constexpr unsigned MaxLatency = ~0u;
  return Scaled > MaxLatency ? MaxLatency : static_cast<unsigned>(Scaled);
}

InstructionUniformity
SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
Expand Down
39 changes: 39 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,13 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
const GCNSubtarget &ST;
TargetSchedModel SchedModel;
mutable std::unique_ptr<AMDGPUMIRFormatter> Formatter;
// Percentage applied to the machine-model latency of loads that are neither
// DS nor VMEM/FLAT: the final latency is `Latency * Factor / 100`, so 100
// leaves it unchanged. Declared mutable so the const setter can change it.
mutable unsigned LoadLatencyScaleFactor = 100;
// Separate scale factor (same percentage semantics) for LDS (DS) read
// operations.
mutable unsigned DSReadLatencyScaleFactor = 100;
// Separate scale factor (same percentage semantics) for VMEM/BUFFER/FLAT
// loads.
mutable unsigned VMEMLoadLatencyScaleFactor = 100;

// The inverse predicate should have the negative value.
enum BranchPredicate {
Expand All @@ -111,6 +118,38 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
static BranchPredicate getBranchPredicate(unsigned Opcode);

public:
/// Sets the percentage scale factor for loads that are neither DS nor
/// VMEM/FLAT. Callable on a const object because the member is mutable.
void setLoadLatencyScaleFactor(unsigned Factor) const {
LoadLatencyScaleFactor = Factor;
}

/// Returns the current percentage scale factor for loads that are neither DS
/// nor VMEM/FLAT.
unsigned getLoadLatencyScaleFactor() const { return LoadLatencyScaleFactor; }

/// Sets the percentage scale factor for DS (LDS) read latency, independently
/// of the other load categories. Callable on a const object because the
/// member is mutable.
void setDSReadLatencyScaleFactor(unsigned Factor) const {
DSReadLatencyScaleFactor = Factor;
}
/// Returns the current percentage scale factor for DS (LDS) read latency.
unsigned getDSReadLatencyScaleFactor() const {
return DSReadLatencyScaleFactor;
}

/// Sets the percentage scale factor for VMEM/BUFFER/FLAT load latency,
/// independently of the other load categories. Callable on a const object
/// because the member is mutable.
void setVMEMLoadLatencyScaleFactor(unsigned Factor) const {
VMEMLoadLatencyScaleFactor = Factor;
}
/// Returns the current percentage scale factor for VMEM/BUFFER/FLAT load
/// latency.
unsigned getVMEMLoadLatencyScaleFactor() const {
return VMEMLoadLatencyScaleFactor;
}

// TargetSchedModel latency hooks.

/// Latency of \p MI from the generic hook, with load latencies multiplied by
/// the matching scale factor and divided by 100. Returns std::nullopt when
/// the generic hook provides no latency.
std::optional<unsigned>
getInstrLatency(const TargetSchedModel &TargetSchedModel,
const MachineInstr &MI) const override;
/// Operand latency from the generic hook between \p DefMI and \p UseMI. When
/// \p DefMI is a non-null load, the latency is multiplied by the matching
/// scale factor and divided by 100. Returns std::nullopt when the generic
/// hook provides no latency.
std::optional<unsigned> getOperandLatency(const TargetSchedModel &SchedModel,
const MachineInstr *DefMI,
unsigned DefIdx,
const MachineInstr *UseMI,
unsigned UseIdx) const override;

unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
MachineRegisterInfo &MRI,
const MachineOperand &SuperReg,
Expand Down
Loading
Loading