Commit c871117

Address more feedback
- Use SchedModel instead of instruction itinerary.
- Don't use getNumCoveredRegs to get the number of registers; use the register class instead.
- Clarify some comments.
- Other minor changes.
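For readers not familiar with the two latency APIs mentioned in the first bullet, the sketch below contrasts the itinerary-based query this commit drops with the scheduling-model query it adopts. It is illustrative only: the helper names are invented, and only the two LLVM calls themselves appear in the diff below.

// Illustrative sketch; helper names are invented and not part of the commit.
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/MC/MCInstrItineraries.h"
using namespace llvm;

// Old path: itinerary-driven latency through TargetInstrInfo.
static unsigned latencyViaItinerary(const TargetInstrInfo &TII,
                                    const InstrItineraryData *Itin,
                                    const MachineInstr &MI) {
  return TII.getInstrLatency(Itin, MI);
}

// New path: the subtarget's TargetSchedModel, which falls back on MC
// scheduling model defaults when no per-opcode data is available.
static unsigned latencyViaSchedModel(const TargetSchedModel &SM,
                                     const MachineInstr &MI) {
  return SM.computeInstrLatency(&MI);
}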
Parent: 8d2fa9c

2 files changed: +88 −45 lines

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 58 additions & 32 deletions
@@ -1225,7 +1225,7 @@ bool PreRARematStage::initGCNSchedStage() {
 << "] REMAT (always) | " << *Remat.DefMI);
 rematerialize(Remat, RecomputeRP);
 } else {
-ScoredRemats.emplace_back(&Remat, DAG.ST, *DAG.TII);
+ScoredRemats.emplace_back(&Remat, DAG);
 }
 }
 unsetSatisifedRPTargets(RescheduleRegions);
@@ -1286,9 +1286,12 @@ bool PreRARematStage::initGCNSchedStage() {
 REMAT_DEBUG(dbgs() << "[" << MIRegion[Remat.DefMI] << "] REMAT *" << Score
 << "* | " << *Remat.DefMI);
 MachineInstr *RematMI = rematerialize(Remat, RecomputeRP);
-// Every rematerialization done with the objective of increasing occupancy
-// increases latency. If we don't manage to increase occupancy, we want to
-// roll them back.
+// Every rematerialization we do here is likely to move the instruction
+// into a higher frequency region, increasing the total sum latency of the
+// instruction itself. This is acceptable if we are eliminating a spill in
+// the process, but when the goal is increasing occupancy we get nothing
+// out of rematerialization if occupancy is not increased in the end; in
+// such cases we want to roll back the rematerialization.
 if (TargetOcc)
 Rollbackable.push_back({RematMI, &Remat});
 unsetSatisifedRPTargets(Remat.Live);
@@ -1315,7 +1318,7 @@ bool PreRARematStage::initGCNSchedStage() {
 DAG.Pressure[I] = RP;
 unsigned NewRegionOcc = RP.getOccupancy(ST, DynamicVGPRBlockSize);
 AchievedOcc = std::min(AchievedOcc, NewRegionOcc);
-REMAT_DEBUG(dbgs() << "[" << I << "] Achieved occupancy " << NewRegionOcc
+REMAT_DEBUG(dbgs() << '[' << I << "] Achieved occupancy " << NewRegionOcc
 << " (" << RPTargets[I] << ")\n");
 }

@@ -1878,9 +1881,9 @@ bool PreRARematStage::collectRematRegs(ArrayRef<uint64_t> RegionFreq) {
 // regions containing rematerializable instructions.
 DAG.RegionLiveOuts.buildLiveRegMap();

-// Set of registers already marked for potential remterialization; used for
-// remat chains checks.
-DenseSet<Register> RematRegSet;
+// Set of registers already marked for potential rematerialization; used to
+// avoid rematerialization chains.
+SmallSet<Register, 4> RematRegSet;
 auto IsMORematable = [&RematRegSet](const MachineOperand &MO) -> bool {
 return MO.isReg() && RematRegSet.contains(MO.getReg());
 };
@@ -1979,15 +1982,45 @@ PreRARematStage::RematReg::insertMI(unsigned RegionIdx,
 return NewMI;
 }

+unsigned PreRARematStage::ScoredRemat::getNumRegs(
+const GCNScheduleDAGMILive &DAG) const {
+// FIXME: this doesn't account for the fact that the rematerialization may be
+// for a subregister. In that case we will overestimate the number of
+// registers involved. This is acceptable since this is purely used for the
+// scoring heuristic, but we should find a way to compute the number of
+// registers actually covered by the register/subregister pair.
+Register Reg = Remat->DefMI->getOperand(0).getReg();
+const TargetRegisterClass &RC = *DAG.MRI.getRegClass(Reg);
+return divideCeil(DAG.TRI->getRegSizeInBits(RC), 32);
+}
+
+unsigned PreRARematStage::ScoredRemat::getLatencyGain(
+const GCNScheduleDAGMILive &DAG) const {
+if (hasUnknownLatencyGain())
+return 0;
+
+const TargetSchedModel &SchedModel = DAG.ST.getInstrInfo()->getSchedModel();
+
+// Rematerializing the register to its using region changes the number of
+// times we will execute it in total.
+unsigned FreqDiff = Remat->UseFrequency - Remat->DefFrequency;
+int RematLatDiff = FreqDiff * SchedModel.computeInstrLatency(Remat->DefMI);
+
+// We assume that spilling the register means we have to insert a save in its
+// defining region and a restore in its using region. Spill instruction
+// opcodes do not have corresponding scheduling models so we cannot accurately
+// estimate their latency. Since this is just meant as a heuristic, use the
+// default high latency from the MC scheduling model.
+int SpillLatDiff = SchedModel.getMCSchedModel()->DefaultHighLatency *
+(Remat->DefFrequency + Remat->UseFrequency);
+
+return SpillLatDiff - RematLatDiff;
+}
+
 PreRARematStage::ScoredRemat::ScoredRemat(const RematReg *Remat,
-const GCNSubtarget &ST,
-const TargetInstrInfo &TII)
-: Remat(Remat) {
-const InstrItineraryData *Itin = ST.getInstrItineraryData();
-if (Remat->DefFrequency && Remat->UseFrequency) {
-InstrLatencyGain = Remat->DefFrequency - Remat->UseFrequency;
-*InstrLatencyGain *= TII.getInstrLatency(Itin, *Remat->DefMI);
-}
+const GCNScheduleDAGMILive &DAG)
+: Remat(Remat), NumRegs(getNumRegs(DAG)),
+RematLatencyGainOverSpill(getLatencyGain(DAG)) {
 resetScore();
 }
@@ -2011,20 +2044,13 @@ void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
 // we get by increasing occupancy and compare it to the latency hit each wave
 // will be subjected to.
 if (ReduceSpill) {
-// It may be better to let the register spill if it is defined by a very
-// high latency instruction. Try to estimate the latency gain induced by
-// rematerializing the register.
-//
-// If we don't know the rematerializations's latency gain we don't know
-// what to compare the spill latency against. We still consider the
-// rematerialization potentially beneficial in such cases because we don't
-// want to miss rematerialization opportunities and rematerializing is in
-// most cases cheaper than spilling. We still give a bonus to remats for
-// which we are able to do the calculation.
-if (InstrLatencyGain && *InstrLatencyGain < 0) {
-int SpillLatencyGain = SaveCost * Remat->DefFrequency;
-SpillLatencyGain += RestoreCost * Remat->UseFrequency;
-if (*InstrLatencyGain + SpillLatencyGain < 0)
+// If we don't know the latency gain, we still consider the
+// rematerialization potentially beneficial because we don't want to miss
+// rematerialization opportunities and rematerializing is in most cases
+// cheaper than spilling. We still give a bonus to remats for which we are
+// able to do the calculation.
+if (!hasUnknownLatencyGain()) {
+if (RematLatencyGainOverSpill < 0)
 return setUselessRemat();
 setKnownLatencyGain();
 }
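For intuition on how RematLatencyGainOverSpill, computed in getLatencyGain above, drives this check, here is the arithmetic with invented frequencies and latencies; this is a sketch only, and the DefaultHighLatency value is assumed for the example rather than taken from any target.

// Invented example values, mirroring the arithmetic in getLatencyGain.
int exampleLatencyGain() {
  unsigned DefFrequency = 1, UseFrequency = 8; // remat moves the def into a hotter region
  unsigned InstrLatency = 4;                   // stand-in for SchedModel.computeInstrLatency(DefMI)
  unsigned DefaultHighLatency = 10;            // stand-in for the MC model's fallback latency
  int RematLatDiff = (UseFrequency - DefFrequency) * InstrLatency;       // 7 * 4  = 28
  int SpillLatDiff = DefaultHighLatency * (DefFrequency + UseFrequency); // 10 * 9 = 90
  return SpillLatDiff - RematLatDiff;          // 62 > 0: rematerializing beats spilling
}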
@@ -2033,7 +2059,7 @@ void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
 // The estimated RP reduction is proportional to the total frequency in target
 // regions where the register is live.
 Register Reg = Remat->DefMI->getOperand(0).getReg();
-unsigned RPScore = 0;
+ScoreTy RPScore = 0;
 for (unsigned I : TargetRegions.set_bits()) {
 unsigned Freq = std::max(RegionFreq[I], static_cast<uint64_t>(1));
 if (Remat->isBeneficialRegion(I))
@@ -2044,7 +2070,7 @@ void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,

 // The estimated RP reduction is directly proportional to the size of the
 // rematerializable register.
-setRPScore(RPScore * SIRegisterInfo::getNumCoveredRegs(Remat->Mask));
+setRPScore(RPScore * NumRegs);
 }

 MachineInstr *PreRARematStage::rematerialize(const RematReg &Remat,
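A small worked example of the new register-class-based count (values invented): a register whose class is 128 bits wide counts as four 32-bit units in the RP score multiplier, even if the def only writes a subregister, which is the overestimate the FIXME in getNumRegs acknowledges.

// Invented example of the new register count: divideCeil(RegSizeInBits, 32).
unsigned exampleNumRegs() {
  unsigned RegSizeInBits = 128;             // e.g. a 128-bit register class (assumed)
  return (RegSizeInBits + 31) / 32;         // == 4, matching divideCeil(128, 32)
}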

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Lines changed: 30 additions & 13 deletions
@@ -441,6 +441,19 @@ class ClusteredLowOccStage : public GCNSchedStage {
 /// reducing spilling or increasing occupancy is possible, it tries to
 /// rematerialize as few registers as possible to reduce potential negative
 /// effects on function latency.
+///
+/// The stage only supports rematerializing registers that meet all of the
+/// following constraints:
+/// 1. The register is virtual and has a single defining instruction.
+/// 2. The single defining instruction is either deemed rematerializable by the
+/// target-independent logic, or if not, has no non-constant and
+/// non-ignorable physical register use.
+/// 3. The register has no virtual register use whose live range would be
+/// extended by the rematerialization.
+/// 4. The register has a single non-debug user in a different region from its
+/// defining region.
+/// 5. The register does not use, and is not used by, another register that is
+/// going to be rematerialized.
 class PreRARematStage : public GCNSchedStage {
 private:
 /// Groups information about a rematerializable register.
@@ -520,8 +533,7 @@ class PreRARematStage : public GCNSchedStage {

 /// This only initializes state-independent characteristics of \p Remat, not
 /// the actual score.
-ScoredRemat(const RematReg *Remat, const GCNSubtarget &ST,
-const TargetInstrInfo &TII);
+ScoredRemat(const RematReg *Remat, const GCNScheduleDAGMILive &DAG);

 /// Updates the rematerialization's score w.r.t. the current \p RPTargets.
 /// \p RegionFreq indicates the frequency of each region
@@ -540,19 +552,22 @@ class PreRARematStage : public GCNSchedStage {
 }

 private:
-/// Estimated save/restore latency costs for spilling a register to stack.
-/// FIXME: These numbers are very arbitrary. Need a good rationale for them,
-/// which I don't know where to get from.
-static constexpr int SaveCost = 100, RestoreCost = 100;
 /// Per-region contribution weights to RP score depending on whether RP is
 /// guaranteed or only likely to be reduced in the region. Only their
 /// relative value w.r.t. one another matter.
 static constexpr int WeightRP = 10, WeightRPMaybe = 5;

-/// Latency gain induced by rematerializing the instruction. Does not
-/// include estimated spilling cost of *not* rematerializing (save/restore
-/// to/from stack).
-std::optional<int> InstrLatencyGain = std::nullopt;
+/// Number of 32-bit registers this rematerialization covers.
+const unsigned NumRegs;
+/// Latency gain induced by rematerializing the register over spilling its
+/// defining instruction.
+const int RematLatencyGainOverSpill;
+
+/// Whether we are unable to estimate the latency gain of rematerializing
+/// over spilling; the estimate requires knowing defining/using region
+/// frequencies.
+bool hasUnknownLatencyGain() const {
+return !Remat->DefFrequency || !Remat->UseFrequency;
+}

 using ScoreTy = int32_t;
 /// Overall rematerialization score. Scoring components are mapped to bit
@@ -568,9 +583,11 @@ class PreRARematStage : public GCNSchedStage {

 void setKnownLatencyGain() { Score |= 1; }

-void setRPScore(unsigned RPScore) {
-Score |= static_cast<ScoreTy>(RPScore) << 1;
-}
+void setRPScore(ScoreTy RPScore) { Score |= RPScore << 1; }
+
+unsigned getNumRegs(const GCNScheduleDAGMILive &DAG) const;
+
+unsigned getLatencyGain(const GCNScheduleDAGMILive &DAG) const;
 };

 /// Maps all MIs (except lone terminators, which are not part of any region)
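The packing shown above keeps the latency bonus in bit 0 of Score and the RP contribution in the higher bits, so any difference in RP score outweighs the bonus when two rematerializations are compared. A tiny illustration with invented values:

// Invented values, only to illustrate the bit layout of Score used above.
#include <cstdint>
int32_t exampleScore() {
  int32_t Score = 0;
  Score |= 1;            // setKnownLatencyGain(): latency bonus occupies bit 0
  int32_t RPScore = 7;   // e.g. weighted target-region frequency sum times NumRegs (made up)
  Score |= RPScore << 1; // setRPScore(): RP contribution occupies bits 1 and up
  return Score;          // 15; a remat with RPScore 8 and no latency bonus scores 16
}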
