Skip to content

Commit 43c740c

Browse files
committed
Simplify score calculation and improve debug output
- Removed some debug-only functions from the header and moved them as close as possible to where they are needed.
- Split the bit-packed score into separate components that are compared one-to-one.
1 parent b0f4cd6 commit 43c740c

File tree

2 files changed

+116
-180
lines changed

2 files changed

+116
-180
lines changed

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 79 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -1281,46 +1281,10 @@ bool ClusteredLowOccStage::initGCNSchedStage() {
12811281
#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
12821282

12831283
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1284-
void PreRARematStage::printTargetRegions(bool PrintAll) const {
1285-
if (PrintAll) {
1286-
for (auto [I, Target] : enumerate(RPTargets))
1287-
dbgs() << REMAT_PREFIX << " [" << I << "] " << Target << '\n';
1288-
return;
1289-
}
1290-
if (TargetRegions.none()) {
1291-
dbgs() << REMAT_PREFIX << "No target regions\n";
1292-
return;
1293-
}
1294-
dbgs() << REMAT_PREFIX << "Target regions:\n";
1295-
for (unsigned I : TargetRegions.set_bits())
1296-
dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n';
1297-
}
1298-
1299-
void PreRARematStage::RematReg::print() const {
1300-
dbgs() << REMAT_PREFIX << " [" << DefRegion << "] " << *DefMI;
1301-
dbgs() << REMAT_PREFIX << " -> used in [" << UseRegion << "] " << *UseMI;
1302-
dbgs() << REMAT_PREFIX << " Guaranteed RP reduction in:";
1303-
for (unsigned I : Live.set_bits()) {
1304-
if (isUnusedLiveThrough(I))
1305-
dbgs() << " [" << I << "]";
1306-
}
1307-
dbgs() << '\n';
1308-
dbgs() << REMAT_PREFIX << " Possible RP reduction in:";
1309-
for (unsigned I : Live.set_bits()) {
1310-
if (!isUnusedLiveThrough(I))
1311-
dbgs() << " [" << I << "]";
1312-
}
1313-
dbgs() << '\n';
1314-
}
1315-
1316-
void PreRARematStage::ScoredRemat::print() const {
1317-
ScoreTy ShiftScore = Score;
1318-
ScoreTy RegionImpact = ShiftScore & ((1 << RegionImpactWidth) - 1);
1319-
ShiftScore >>= RegionImpactWidth;
1320-
ScoreTy FreqDiff = ShiftScore & ((1 << FreqDiffWidth) - 1);
1321-
ShiftScore >>= FreqDiffWidth;
1322-
ScoreTy MaxFreq = ShiftScore;
1323-
dbgs() << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
1284+
Printable PreRARematStage::ScoredRemat::print() const {
1285+
return Printable([&](raw_ostream &OS) {
1286+
OS << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
1287+
});
13241288
}
13251289
#endif
13261290

@@ -1351,6 +1315,38 @@ bool PreRARematStage::initGCNSchedStage() {
13511315
RegionBB.push_back(ParentMBB);
13521316
}
13531317

1318+
#ifndef NDEBUG
1319+
auto PrintTargetRegions = [&]() -> void {
1320+
if (TargetRegions.none()) {
1321+
dbgs() << REMAT_PREFIX << "No target regions\n";
1322+
return;
1323+
}
1324+
dbgs() << REMAT_PREFIX << "Target regions:\n";
1325+
for (unsigned I : TargetRegions.set_bits())
1326+
dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n';
1327+
};
1328+
auto PrintRematReg = [&](const RematReg &Remat) -> Printable {
1329+
return Printable([&, Remat](raw_ostream &OS) {
1330+
// Concatenate all region numbers in which the register is unused and
1331+
// live-through.
1332+
std::string UnusedLTRegions;
1333+
for (unsigned I = 0; I < NumRegions; ++I) {
1334+
if (Remat.isUnusedLiveThrough(I)) {
1335+
if (!UnusedLTRegions.empty())
1336+
UnusedLTRegions += ",";
1337+
UnusedLTRegions += std::to_string(I);
1338+
}
1339+
}
1340+
if (!UnusedLTRegions.empty())
1341+
UnusedLTRegions = "- " + UnusedLTRegions + " -";
1342+
OS << "[" << Remat.DefRegion << " -" << UnusedLTRegions << "> "
1343+
<< Remat.UseRegion << "] ";
1344+
Remat.DefMI->print(OS, /*IsStandalone=*/true, /*SkipOpers=*/false,
1345+
/*SkipDebugLoc=*/false, /*AddNewLine=*/false);
1346+
});
1347+
};
1348+
#endif
1349+
13541350
// Set an objective for the stage based on current RP in each region.
13551351
REMAT_DEBUG({
13561352
dbgs() << "Analyzing ";
@@ -1369,22 +1365,19 @@ bool PreRARematStage::initGCNSchedStage() {
13691365
dbgs() << "reduce spilling (minimum target occupancy is "
13701366
<< MFI.getMinWavesPerEU() << ")\n";
13711367
}
1372-
printTargetRegions(/*PrintAll=*/TargetRegions.none());
1368+
PrintTargetRegions();
13731369
});
13741370

13751371
if (!collectRematRegs(MIRegion)) {
13761372
REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
13771373
return false;
13781374
}
1375+
const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
13791376
REMAT_DEBUG({
13801377
dbgs() << "Rematerializable registers:\n";
13811378
for (const RematReg &Remat : RematRegs)
1382-
Remat.print();
1383-
});
1384-
1385-
const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
1386-
REMAT_DEBUG({
1387-
dbgs() << "Region frequencies\n";
1379+
dbgs() << REMAT_PREFIX << " " << PrintRematReg(Remat) << '\n';
1380+
dbgs() << REMAT_PREFIX << "Region frequencies\n";
13881381
for (auto [I, Freq] : enumerate(FreqInfo.Regions)) {
13891382
dbgs() << REMAT_PREFIX << " [" << I << "] ";
13901383
if (Freq)
@@ -1406,52 +1399,53 @@ bool PreRARematStage::initGCNSchedStage() {
14061399
#endif
14071400
BitVector RecomputeRP(NumRegions);
14081401
do {
1402+
assert(!ScoredRemats.empty() && "no more remat candidates");
1403+
14091404
// (Re-)Score and (re-)sort all remats in increasing score order.
14101405
for (ScoredRemat &Remat : ScoredRemats)
14111406
Remat.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc);
14121407
sort(ScoredRemats);
14131408

14141409
REMAT_DEBUG({
1415-
dbgs() << "==== ROUND " << RoundNum << " ====\n";
1416-
for (const ScoredRemat &SRemat : ScoredRemats) {
1417-
dbgs() << REMAT_PREFIX;
1418-
SRemat.print();
1419-
dbgs() << " | " << *SRemat.Remat->DefMI;
1410+
dbgs() << "==== ROUND " << RoundNum << " ====\n"
1411+
<< REMAT_PREFIX
1412+
<< "Candidates with non-null score, in rematerialization order:\n";
1413+
for (const ScoredRemat &RematDecision : reverse(ScoredRemats)) {
1414+
if (RematDecision.hasNullScore())
1415+
break;
1416+
dbgs() << REMAT_PREFIX << " " << RematDecision.print() << " | "
1417+
<< *RematDecision.Remat->DefMI;
14201418
}
1421-
printTargetRegions();
1419+
PrintTargetRegions();
14221420
});
14231421

14241422
RecomputeRP.reset();
1425-
int RematIdx = ScoredRemats.size() - 1;
1423+
unsigned RematIdx = ScoredRemats.size();
14261424

14271425
// Rematerialize registers in decreasing score order until we estimate
14281426
// that all RP targets are satisfied or until rematerialization candidates
14291427
// are no longer useful to decrease RP.
1430-
for (; RematIdx >= 0 && TargetRegions.any(); --RematIdx) {
1431-
const RematReg &Remat = *ScoredRemats[RematIdx].Remat;
1432-
// Stop on null score. Since scores monotonically decrease as we
1433-
// rematerialize, we know there is nothing useful left to do in such
1434-
// cases.
1435-
if (ScoredRemats[RematIdx].hasNullScore()) {
1436-
REMAT_DEBUG(dbgs() << "*** Stop on null score | " << *Remat.DefMI);
1437-
RematIdx = -1;
1428+
for (; RematIdx && TargetRegions.any(); --RematIdx) {
1429+
const ScoredRemat &Candidate = ScoredRemats[RematIdx - 1];
1430+
// Stop rematerializing on encountering a null score. Since scores
1431+
// monotonically decrease as we rematerialize, we know there is nothing
1432+
// useful left to do in such cases, even if we were to re-score.
1433+
if (Candidate.hasNullScore()) {
1434+
RematIdx = 0;
14381435
break;
14391436
}
14401437

1438+
const RematReg &Remat = *Candidate.Remat;
14411439
// When previous rematerializations in this round have already satisfied
14421440
// RP targets in all regions this rematerialization can impact, we have a
14431441
// good indication that our scores have diverged significantly from
14441442
// reality, in which case we interrupt this round and re-score. This also
14451443
// ensures that every rematerialization we perform is possibly impactful
14461444
// in at least one target region.
1447-
if (!Remat.maybeBeneficial(TargetRegions, RPTargets)) {
1448-
REMAT_DEBUG(dbgs() << "*** Stop round on stale score | "
1449-
<< *Remat.DefMI);
1445+
if (!Remat.maybeBeneficial(TargetRegions, RPTargets))
14501446
break;
1451-
}
14521447

1453-
REMAT_DEBUG(dbgs() << "*** REMAT [" << Remat.DefRegion << " -> "
1454-
<< Remat.UseRegion << "] | " << *Remat.DefMI);
1448+
REMAT_DEBUG(dbgs() << "** REMAT " << PrintRematReg(Remat) << '\n';);
14551449
// Every rematerialization we do here is likely to move the instruction
14561450
// into a higher frequency region, increasing the total sum latency of the
14571451
// instruction itself. This is acceptable if we are eliminating a spill in
@@ -1468,14 +1462,18 @@ bool PreRARematStage::initGCNSchedStage() {
14681462
++RoundNum;
14691463
#endif
14701464
REMAT_DEBUG({
1471-
if (!TargetRegions.any())
1472-
dbgs() << "*** Stop round on all targets achieved\n";
1473-
else if (RematIdx == -1)
1474-
dbgs() << "*** Stop round on exhausted remat opportunities\n";
1465+
if (!TargetRegions.any()) {
1466+
dbgs() << "** Interrupt round on all targets achieved\n";
1467+
} else if (RematIdx) {
1468+
dbgs() << "** Interrupt round on stale score for "
1469+
<< *ScoredRemats[RematIdx - 1].Remat->DefMI;
1470+
} else {
1471+
dbgs() << "** Stop on exhausted rematerialization candidates\n";
1472+
}
14751473
});
14761474

14771475
// Peel off registers we already rematerialized from the vector's tail.
1478-
ScoredRemats.truncate(RematIdx + 1);
1476+
ScoredRemats.truncate(RematIdx);
14791477
} while ((updateAndVerifyRPTargets(RecomputeRP) || TargetRegions.any()) &&
14801478
!ScoredRemats.empty());
14811479
if (RescheduleRegions.none())
@@ -2208,20 +2206,11 @@ PreRARematStage::ScoredRemat::FreqInfo::FreqInfo(
22082206
if (!MinFreq)
22092207
return;
22102208

2211-
// Normalize to minimum observed frequency to avoid overflows when adding up
2212-
// frequencies.
2209+
// Normalize to minimum observed frequency to avoid underflows/overflows when
2210+
// combining frequencies.
22132211
for (uint64_t &Freq : Regions)
22142212
Freq /= MinFreq;
22152213
MaxFreq /= MinFreq;
2216-
2217-
// Compute the scaling factor for scoring frequency differences.
2218-
const uint64_t MaxDiff = MaxFreq - 1;
2219-
const uint64_t MaxReprFreqValue = (1 << FreqDiffWidth) - 1;
2220-
RescaleIsDenom = (2 * MaxDiff) & ~MaxReprFreqValue;
2221-
if (RescaleIsDenom)
2222-
RescaleFactor = (2 * MaxDiff) >> FreqDiffWidth;
2223-
else
2224-
RescaleFactor = MaxDiff ? MaxReprFreqValue / (2 * MaxDiff) : 1;
22252214
}
22262215

22272216
PreRARematStage::ScoredRemat::ScoredRemat(const RematReg *Remat,
@@ -2244,7 +2233,7 @@ unsigned PreRARematStage::ScoredRemat::getNumRegs(
22442233
return divideCeil(RegSize, 32);
22452234
}
22462235

2247-
uint64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
2236+
int64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
22482237
// Get frequencies of defining and using regions. A rematerialization from the
22492238
// least frequent region to the most frequent region will yield the greatest
22502239
// latency penalty and therefore should get minimum score. Reciprocally, a
@@ -2256,36 +2245,22 @@ uint64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
22562245
uint64_t UseOrMax = Freq.Regions[Remat->UseRegion];
22572246
if (!UseOrMax)
22582247
UseOrMax = Freq.MaxFreq;
2259-
2260-
// Maximum difference in frequency between defining and using regions.
2261-
const uint64_t MaxDiff = Freq.MaxFreq - 1;
2262-
// The difference between defining and using frequency is in the range
2263-
// [-MaxDiff, MaxDiff], shift it to [0,2 x MaxDiff] to stay in the positive
2264-
// range, then rescale to the representable range in the final score.
2265-
const uint64_t FreqDiff = (MaxDiff + (DefOrOne - UseOrMax));
2266-
if (Freq.RescaleIsDenom)
2267-
return FreqDiff / Freq.RescaleFactor;
2268-
return FreqDiff * Freq.RescaleFactor;
2248+
return DefOrOne - UseOrMax;
22692249
}
22702250

22712251
void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
22722252
ArrayRef<GCNRPTarget> RPTargets,
22732253
const FreqInfo &FreqInfo,
22742254
bool ReduceSpill) {
2275-
setNullScore();
2276-
if (!Remat->maybeBeneficial(TargetRegions, RPTargets))
2277-
return;
2278-
2279-
Register Reg = Remat->getReg();
2280-
uint64_t MaxFreq = 0;
2281-
ScoreTy NumBenefitingRegions = 0;
2255+
MaxFreq = 0;
2256+
RegionImpact = 0;
22822257
for (unsigned I : TargetRegions.set_bits()) {
2283-
if (!Remat->Live[I] || !RPTargets[I].isSaveBeneficial(Reg))
2258+
if (!Remat->Live[I] || !RPTargets[I].isSaveBeneficial(Remat->getReg()))
22842259
continue;
22852260
bool UnusedLT = Remat->isUnusedLiveThrough(I);
22862261

22872262
// Regions in which RP is guaranteed to decrease have more weight.
2288-
NumBenefitingRegions += UnusedLT ? 2 : 1;
2263+
RegionImpact += UnusedLT ? 2 : 1;
22892264

22902265
if (ReduceSpill) {
22912266
uint64_t Freq = FreqInfo.Regions[I];
@@ -2297,9 +2272,6 @@ void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
22972272
MaxFreq = std::max(MaxFreq, Freq);
22982273
}
22992274
}
2300-
setMaxFreqScore(MaxFreq);
2301-
setFreqDiffScore(FreqDiff);
2302-
setRegionImpactScore(NumBenefitingRegions * NumRegs);
23032275
}
23042276

23052277
void PreRARematStage::rematerialize(const RematReg &Remat,

0 commit comments

Comments
 (0)