Skip to content

Commit fd86daf

Browse files
committed
Simplify score calculation and improve debug
- Removed some debug-only functions from the header and moved them as close as possible to where they are needed.
- Split the bit-packed score into separate components that are compared one-to-one.
1 parent 4edde52 commit fd86daf

File tree

2 files changed

+116
-180
lines changed

2 files changed

+116
-180
lines changed

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 79 additions & 107 deletions
Original file line number | Diff line number | Diff line change
@@ -1102,46 +1102,10 @@ bool ClusteredLowOccStage::initGCNSchedStage() {
11021102
#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
11031103

11041104
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1105-
void PreRARematStage::printTargetRegions(bool PrintAll) const {
1106-
if (PrintAll) {
1107-
for (auto [I, Target] : enumerate(RPTargets))
1108-
dbgs() << REMAT_PREFIX << " [" << I << "] " << Target << '\n';
1109-
return;
1110-
}
1111-
if (TargetRegions.none()) {
1112-
dbgs() << REMAT_PREFIX << "No target regions\n";
1113-
return;
1114-
}
1115-
dbgs() << REMAT_PREFIX << "Target regions:\n";
1116-
for (unsigned I : TargetRegions.set_bits())
1117-
dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n';
1118-
}
1119-
1120-
void PreRARematStage::RematReg::print() const {
1121-
dbgs() << REMAT_PREFIX << " [" << DefRegion << "] " << *DefMI;
1122-
dbgs() << REMAT_PREFIX << " -> used in [" << UseRegion << "] " << *UseMI;
1123-
dbgs() << REMAT_PREFIX << " Guaranteed RP reduction in:";
1124-
for (unsigned I : Live.set_bits()) {
1125-
if (isUnusedLiveThrough(I))
1126-
dbgs() << " [" << I << "]";
1127-
}
1128-
dbgs() << '\n';
1129-
dbgs() << REMAT_PREFIX << " Possible RP reduction in:";
1130-
for (unsigned I : Live.set_bits()) {
1131-
if (!isUnusedLiveThrough(I))
1132-
dbgs() << " [" << I << "]";
1133-
}
1134-
dbgs() << '\n';
1135-
}
1136-
1137-
void PreRARematStage::ScoredRemat::print() const {
1138-
ScoreTy ShiftScore = Score;
1139-
ScoreTy RegionImpact = ShiftScore & ((1 << RegionImpactWidth) - 1);
1140-
ShiftScore >>= RegionImpactWidth;
1141-
ScoreTy FreqDiff = ShiftScore & ((1 << FreqDiffWidth) - 1);
1142-
ShiftScore >>= FreqDiffWidth;
1143-
ScoreTy MaxFreq = ShiftScore;
1144-
dbgs() << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
1105+
Printable PreRARematStage::ScoredRemat::print() const {
1106+
return Printable([&](raw_ostream &OS) {
1107+
OS << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
1108+
});
11451109
}
11461110
#endif
11471111

@@ -1172,6 +1136,38 @@ bool PreRARematStage::initGCNSchedStage() {
11721136
RegionBB.push_back(ParentMBB);
11731137
}
11741138

1139+
#ifndef NDEBUG
1140+
auto PrintTargetRegions = [&]() -> void {
1141+
if (TargetRegions.none()) {
1142+
dbgs() << REMAT_PREFIX << "No target regions\n";
1143+
return;
1144+
}
1145+
dbgs() << REMAT_PREFIX << "Target regions:\n";
1146+
for (unsigned I : TargetRegions.set_bits())
1147+
dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n';
1148+
};
1149+
auto PrintRematReg = [&](const RematReg &Remat) -> Printable {
1150+
return Printable([&, Remat](raw_ostream &OS) {
1151+
// Concatenate all region numbers in which the register is unused and
1152+
// live-through.
1153+
std::string UnusedLTRegions;
1154+
for (unsigned I = 0; I < NumRegions; ++I) {
1155+
if (Remat.isUnusedLiveThrough(I)) {
1156+
if (!UnusedLTRegions.empty())
1157+
UnusedLTRegions += ",";
1158+
UnusedLTRegions += std::to_string(I);
1159+
}
1160+
}
1161+
if (!UnusedLTRegions.empty())
1162+
UnusedLTRegions = "- " + UnusedLTRegions + " -";
1163+
OS << "[" << Remat.DefRegion << " -" << UnusedLTRegions << "> "
1164+
<< Remat.UseRegion << "] ";
1165+
Remat.DefMI->print(OS, /*IsStandalone=*/true, /*SkipOpers=*/false,
1166+
/*SkipDebugLoc=*/false, /*AddNewLine=*/false);
1167+
});
1168+
};
1169+
#endif
1170+
11751171
// Set an objective for the stage based on current RP in each region.
11761172
REMAT_DEBUG({
11771173
dbgs() << "Analyzing ";
@@ -1190,22 +1186,19 @@ bool PreRARematStage::initGCNSchedStage() {
11901186
dbgs() << "reduce spilling (minimum target occupancy is "
11911187
<< MFI.getMinWavesPerEU() << ")\n";
11921188
}
1193-
printTargetRegions(/*PrintAll=*/TargetRegions.none());
1189+
PrintTargetRegions();
11941190
});
11951191

11961192
if (!collectRematRegs(MIRegion)) {
11971193
REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
11981194
return false;
11991195
}
1196+
const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
12001197
REMAT_DEBUG({
12011198
dbgs() << "Rematerializable registers:\n";
12021199
for (const RematReg &Remat : RematRegs)
1203-
Remat.print();
1204-
});
1205-
1206-
const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
1207-
REMAT_DEBUG({
1208-
dbgs() << "Region frequencies\n";
1200+
dbgs() << REMAT_PREFIX << " " << PrintRematReg(Remat) << '\n';
1201+
dbgs() << REMAT_PREFIX << "Region frequencies\n";
12091202
for (auto [I, Freq] : enumerate(FreqInfo.Regions)) {
12101203
dbgs() << REMAT_PREFIX << " [" << I << "] ";
12111204
if (Freq)
@@ -1227,52 +1220,53 @@ bool PreRARematStage::initGCNSchedStage() {
12271220
#endif
12281221
BitVector RecomputeRP(NumRegions);
12291222
do {
1223+
assert(!ScoredRemats.empty() && "no more remat candidates");
1224+
12301225
// (Re-)Score and (re-)sort all remats in increasing score order.
12311226
for (ScoredRemat &Remat : ScoredRemats)
12321227
Remat.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc);
12331228
sort(ScoredRemats);
12341229

12351230
REMAT_DEBUG({
1236-
dbgs() << "==== ROUND " << RoundNum << " ====\n";
1237-
for (const ScoredRemat &SRemat : ScoredRemats) {
1238-
dbgs() << REMAT_PREFIX;
1239-
SRemat.print();
1240-
dbgs() << " | " << *SRemat.Remat->DefMI;
1231+
dbgs() << "==== ROUND " << RoundNum << " ====\n"
1232+
<< REMAT_PREFIX
1233+
<< "Candidates with non-null score, in rematerialization order:\n";
1234+
for (const ScoredRemat &RematDecision : reverse(ScoredRemats)) {
1235+
if (RematDecision.hasNullScore())
1236+
break;
1237+
dbgs() << REMAT_PREFIX << " " << RematDecision.print() << " | "
1238+
<< *RematDecision.Remat->DefMI;
12411239
}
1242-
printTargetRegions();
1240+
PrintTargetRegions();
12431241
});
12441242

12451243
RecomputeRP.reset();
1246-
int RematIdx = ScoredRemats.size() - 1;
1244+
unsigned RematIdx = ScoredRemats.size();
12471245

12481246
// Rematerialize registers in decreasing score order until we estimate
12491247
// that all RP targets are satisfied or until rematerialization candidates
12501248
// are no longer useful to decrease RP.
1251-
for (; RematIdx >= 0 && TargetRegions.any(); --RematIdx) {
1252-
const RematReg &Remat = *ScoredRemats[RematIdx].Remat;
1253-
// Stop on null score. Since scores monotonically decrease as we
1254-
// rematerialize, we know there is nothing useful left to do in such
1255-
// cases.
1256-
if (ScoredRemats[RematIdx].hasNullScore()) {
1257-
REMAT_DEBUG(dbgs() << "*** Stop on null score | " << *Remat.DefMI);
1258-
RematIdx = -1;
1249+
for (; RematIdx && TargetRegions.any(); --RematIdx) {
1250+
const ScoredRemat &Candidate = ScoredRemats[RematIdx - 1];
1251+
// Stop rematerializing on encountering a null score. Since scores
1252+
// monotonically decrease as we rematerialize, we know there is nothing
1253+
// useful left to do in such cases, even if we were to re-score.
1254+
if (Candidate.hasNullScore()) {
1255+
RematIdx = 0;
12591256
break;
12601257
}
12611258

1259+
const RematReg &Remat = *Candidate.Remat;
12621260
// When previous rematerializations in this round have already satisfied
12631261
// RP targets in all regions this rematerialization can impact, we have a
12641262
// good indication that our scores have diverged significantly from
12651263
// reality, in which case we interrupt this round and re-score. This also
12661264
// ensures that every rematerialization we perform is possibly impactful
12671265
// in at least one target region.
1268-
if (!Remat.maybeBeneficial(TargetRegions, RPTargets)) {
1269-
REMAT_DEBUG(dbgs() << "*** Stop round on stale score | "
1270-
<< *Remat.DefMI);
1266+
if (!Remat.maybeBeneficial(TargetRegions, RPTargets))
12711267
break;
1272-
}
12731268

1274-
REMAT_DEBUG(dbgs() << "*** REMAT [" << Remat.DefRegion << " -> "
1275-
<< Remat.UseRegion << "] | " << *Remat.DefMI);
1269+
REMAT_DEBUG(dbgs() << "** REMAT " << PrintRematReg(Remat) << '\n';);
12761270
// Every rematerialization we do here is likely to move the instruction
12771271
// into a higher frequency region, increasing the total sum latency of the
12781272
// instruction itself. This is acceptable if we are eliminating a spill in
@@ -1289,14 +1283,18 @@ bool PreRARematStage::initGCNSchedStage() {
12891283
++RoundNum;
12901284
#endif
12911285
REMAT_DEBUG({
1292-
if (!TargetRegions.any())
1293-
dbgs() << "*** Stop round on all targets achieved\n";
1294-
else if (RematIdx == -1)
1295-
dbgs() << "*** Stop round on exhausted remat opportunities\n";
1286+
if (!TargetRegions.any()) {
1287+
dbgs() << "** Interrupt round on all targets achieved\n";
1288+
} else if (RematIdx) {
1289+
dbgs() << "** Interrupt round on stale score for "
1290+
<< *ScoredRemats[RematIdx - 1].Remat->DefMI;
1291+
} else {
1292+
dbgs() << "** Stop on exhausted rematerialization candidates\n";
1293+
}
12961294
});
12971295

12981296
// Peel off registers we already rematerialized from the vector's tail.
1299-
ScoredRemats.truncate(RematIdx + 1);
1297+
ScoredRemats.truncate(RematIdx);
13001298
} while ((updateAndVerifyRPTargets(RecomputeRP) || TargetRegions.any()) &&
13011299
!ScoredRemats.empty());
13021300
if (RescheduleRegions.none())
@@ -2008,20 +2006,11 @@ PreRARematStage::ScoredRemat::FreqInfo::FreqInfo(
20082006
if (!MinFreq)
20092007
return;
20102008

2011-
// Normalize to minimum observed frequency to avoid overflows when adding up
2012-
// frequencies.
2009+
// Normalize to minimum observed frequency to avoid underflows/overflows when
2010+
// combining frequencies.
20132011
for (uint64_t &Freq : Regions)
20142012
Freq /= MinFreq;
20152013
MaxFreq /= MinFreq;
2016-
2017-
// Compute the scaling factor for scoring frequency differences.
2018-
const uint64_t MaxDiff = MaxFreq - 1;
2019-
const uint64_t MaxReprFreqValue = (1 << FreqDiffWidth) - 1;
2020-
RescaleIsDenom = (2 * MaxDiff) & ~MaxReprFreqValue;
2021-
if (RescaleIsDenom)
2022-
RescaleFactor = (2 * MaxDiff) >> FreqDiffWidth;
2023-
else
2024-
RescaleFactor = MaxDiff ? MaxReprFreqValue / (2 * MaxDiff) : 1;
20252014
}
20262015

20272016
PreRARematStage::ScoredRemat::ScoredRemat(const RematReg *Remat,
@@ -2044,7 +2033,7 @@ unsigned PreRARematStage::ScoredRemat::getNumRegs(
20442033
return divideCeil(RegSize, 32);
20452034
}
20462035

2047-
uint64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
2036+
int64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
20482037
// Get frequencies of defining and using regions. A rematerialization from the
20492038
// least frequent region to the most frequent region will yield the greatest
20502039
// latency penalty and therefore should get minimum score. Reciprocally, a
@@ -2056,36 +2045,22 @@ uint64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
20562045
uint64_t UseOrMax = Freq.Regions[Remat->UseRegion];
20572046
if (!UseOrMax)
20582047
UseOrMax = Freq.MaxFreq;
2059-
2060-
// Maximum difference in frequency between defining and using regions.
2061-
const uint64_t MaxDiff = Freq.MaxFreq - 1;
2062-
// The difference between defining and using frequency is in the range
2063-
// [-MaxDiff, MaxDiff], shift it to [0,2 x MaxDiff] to stay in the positive
2064-
// range, then rescale to the representable range in the final score.
2065-
const uint64_t FreqDiff = (MaxDiff + (DefOrOne - UseOrMax));
2066-
if (Freq.RescaleIsDenom)
2067-
return FreqDiff / Freq.RescaleFactor;
2068-
return FreqDiff * Freq.RescaleFactor;
2048+
return DefOrOne - UseOrMax;
20692049
}
20702050

20712051
void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
20722052
ArrayRef<GCNRPTarget> RPTargets,
20732053
const FreqInfo &FreqInfo,
20742054
bool ReduceSpill) {
2075-
setNullScore();
2076-
if (!Remat->maybeBeneficial(TargetRegions, RPTargets))
2077-
return;
2078-
2079-
Register Reg = Remat->getReg();
2080-
uint64_t MaxFreq = 0;
2081-
ScoreTy NumBenefitingRegions = 0;
2055+
MaxFreq = 0;
2056+
RegionImpact = 0;
20822057
for (unsigned I : TargetRegions.set_bits()) {
2083-
if (!Remat->Live[I] || !RPTargets[I].isSaveBeneficial(Reg))
2058+
if (!Remat->Live[I] || !RPTargets[I].isSaveBeneficial(Remat->getReg()))
20842059
continue;
20852060
bool UnusedLT = Remat->isUnusedLiveThrough(I);
20862061

20872062
// Regions in which RP is guaranteed to decrease have more weight.
2088-
NumBenefitingRegions += UnusedLT ? 2 : 1;
2063+
RegionImpact += UnusedLT ? 2 : 1;
20892064

20902065
if (ReduceSpill) {
20912066
uint64_t Freq = FreqInfo.Regions[I];
@@ -2097,9 +2072,6 @@ void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
20972072
MaxFreq = std::max(MaxFreq, Freq);
20982073
}
20992074
}
2100-
setMaxFreqScore(MaxFreq);
2101-
setFreqDiffScore(FreqDiff);
2102-
setRegionImpactScore(NumBenefitingRegions * NumRegs);
21032075
}
21042076

21052077
void PreRARematStage::rematerialize(const RematReg &Remat,

0 commit comments

Comments
 (0)