Skip to content

Commit 474f5d8

Browse files
committed
Simplify score calculation and improve debug output
- Removed some debug-only functions from the header and moved them as close as possible to where they are needed.
- Split the bit-packed score into separate components that are compared one-to-one.
1 parent 6dd8bf2 commit 474f5d8

File tree

2 files changed

+116
-180
lines changed

2 files changed

+116
-180
lines changed

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 79 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -1279,46 +1279,10 @@ bool ClusteredLowOccStage::initGCNSchedStage() {
12791279
#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
12801280

12811281
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1282-
void PreRARematStage::printTargetRegions(bool PrintAll) const {
1283-
if (PrintAll) {
1284-
for (auto [I, Target] : enumerate(RPTargets))
1285-
dbgs() << REMAT_PREFIX << " [" << I << "] " << Target << '\n';
1286-
return;
1287-
}
1288-
if (TargetRegions.none()) {
1289-
dbgs() << REMAT_PREFIX << "No target regions\n";
1290-
return;
1291-
}
1292-
dbgs() << REMAT_PREFIX << "Target regions:\n";
1293-
for (unsigned I : TargetRegions.set_bits())
1294-
dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n';
1295-
}
1296-
1297-
void PreRARematStage::RematReg::print() const {
1298-
dbgs() << REMAT_PREFIX << " [" << DefRegion << "] " << *DefMI;
1299-
dbgs() << REMAT_PREFIX << " -> used in [" << UseRegion << "] " << *UseMI;
1300-
dbgs() << REMAT_PREFIX << " Guaranteed RP reduction in:";
1301-
for (unsigned I : Live.set_bits()) {
1302-
if (isUnusedLiveThrough(I))
1303-
dbgs() << " [" << I << "]";
1304-
}
1305-
dbgs() << '\n';
1306-
dbgs() << REMAT_PREFIX << " Possible RP reduction in:";
1307-
for (unsigned I : Live.set_bits()) {
1308-
if (!isUnusedLiveThrough(I))
1309-
dbgs() << " [" << I << "]";
1310-
}
1311-
dbgs() << '\n';
1312-
}
1313-
1314-
void PreRARematStage::ScoredRemat::print() const {
1315-
ScoreTy ShiftScore = Score;
1316-
ScoreTy RegionImpact = ShiftScore & ((1 << RegionImpactWidth) - 1);
1317-
ShiftScore >>= RegionImpactWidth;
1318-
ScoreTy FreqDiff = ShiftScore & ((1 << FreqDiffWidth) - 1);
1319-
ShiftScore >>= FreqDiffWidth;
1320-
ScoreTy MaxFreq = ShiftScore;
1321-
dbgs() << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
1282+
Printable PreRARematStage::ScoredRemat::print() const {
1283+
return Printable([&](raw_ostream &OS) {
1284+
OS << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
1285+
});
13221286
}
13231287
#endif
13241288

@@ -1349,6 +1313,38 @@ bool PreRARematStage::initGCNSchedStage() {
13491313
RegionBB.push_back(ParentMBB);
13501314
}
13511315

1316+
#ifndef NDEBUG
1317+
auto PrintTargetRegions = [&]() -> void {
1318+
if (TargetRegions.none()) {
1319+
dbgs() << REMAT_PREFIX << "No target regions\n";
1320+
return;
1321+
}
1322+
dbgs() << REMAT_PREFIX << "Target regions:\n";
1323+
for (unsigned I : TargetRegions.set_bits())
1324+
dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n';
1325+
};
1326+
auto PrintRematReg = [&](const RematReg &Remat) -> Printable {
1327+
return Printable([&, Remat](raw_ostream &OS) {
1328+
// Concatenate all region numbers in which the register is unused and
1329+
// live-through.
1330+
std::string UnusedLTRegions;
1331+
for (unsigned I = 0; I < NumRegions; ++I) {
1332+
if (Remat.isUnusedLiveThrough(I)) {
1333+
if (!UnusedLTRegions.empty())
1334+
UnusedLTRegions += ",";
1335+
UnusedLTRegions += std::to_string(I);
1336+
}
1337+
}
1338+
if (!UnusedLTRegions.empty())
1339+
UnusedLTRegions = "- " + UnusedLTRegions + " -";
1340+
OS << "[" << Remat.DefRegion << " -" << UnusedLTRegions << "> "
1341+
<< Remat.UseRegion << "] ";
1342+
Remat.DefMI->print(OS, /*IsStandalone=*/true, /*SkipOpers=*/false,
1343+
/*SkipDebugLoc=*/false, /*AddNewLine=*/false);
1344+
});
1345+
};
1346+
#endif
1347+
13521348
// Set an objective for the stage based on current RP in each region.
13531349
REMAT_DEBUG({
13541350
dbgs() << "Analyzing ";
@@ -1367,22 +1363,19 @@ bool PreRARematStage::initGCNSchedStage() {
13671363
dbgs() << "reduce spilling (minimum target occupancy is "
13681364
<< MFI.getMinWavesPerEU() << ")\n";
13691365
}
1370-
printTargetRegions(/*PrintAll=*/TargetRegions.none());
1366+
PrintTargetRegions();
13711367
});
13721368

13731369
if (!collectRematRegs(MIRegion)) {
13741370
REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
13751371
return false;
13761372
}
1373+
const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
13771374
REMAT_DEBUG({
13781375
dbgs() << "Rematerializable registers:\n";
13791376
for (const RematReg &Remat : RematRegs)
1380-
Remat.print();
1381-
});
1382-
1383-
const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
1384-
REMAT_DEBUG({
1385-
dbgs() << "Region frequencies\n";
1377+
dbgs() << REMAT_PREFIX << " " << PrintRematReg(Remat) << '\n';
1378+
dbgs() << REMAT_PREFIX << "Region frequencies\n";
13861379
for (auto [I, Freq] : enumerate(FreqInfo.Regions)) {
13871380
dbgs() << REMAT_PREFIX << " [" << I << "] ";
13881381
if (Freq)
@@ -1404,52 +1397,53 @@ bool PreRARematStage::initGCNSchedStage() {
14041397
#endif
14051398
BitVector RecomputeRP(NumRegions);
14061399
do {
1400+
assert(!ScoredRemats.empty() && "no more remat candidates");
1401+
14071402
// (Re-)Score and (re-)sort all remats in increasing score order.
14081403
for (ScoredRemat &Remat : ScoredRemats)
14091404
Remat.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc);
14101405
sort(ScoredRemats);
14111406

14121407
REMAT_DEBUG({
1413-
dbgs() << "==== ROUND " << RoundNum << " ====\n";
1414-
for (const ScoredRemat &SRemat : ScoredRemats) {
1415-
dbgs() << REMAT_PREFIX;
1416-
SRemat.print();
1417-
dbgs() << " | " << *SRemat.Remat->DefMI;
1408+
dbgs() << "==== ROUND " << RoundNum << " ====\n"
1409+
<< REMAT_PREFIX
1410+
<< "Candidates with non-null score, in rematerialization order:\n";
1411+
for (const ScoredRemat &RematDecision : reverse(ScoredRemats)) {
1412+
if (RematDecision.hasNullScore())
1413+
break;
1414+
dbgs() << REMAT_PREFIX << " " << RematDecision.print() << " | "
1415+
<< *RematDecision.Remat->DefMI;
14181416
}
1419-
printTargetRegions();
1417+
PrintTargetRegions();
14201418
});
14211419

14221420
RecomputeRP.reset();
1423-
int RematIdx = ScoredRemats.size() - 1;
1421+
unsigned RematIdx = ScoredRemats.size();
14241422

14251423
// Rematerialize registers in decreasing score order until we estimate
14261424
// that all RP targets are satisfied or until rematerialization candidates
14271425
// are no longer useful to decrease RP.
1428-
for (; RematIdx >= 0 && TargetRegions.any(); --RematIdx) {
1429-
const RematReg &Remat = *ScoredRemats[RematIdx].Remat;
1430-
// Stop on null score. Since scores monotonically decrease as we
1431-
// rematerialize, we know there is nothing useful left to do in such
1432-
// cases.
1433-
if (ScoredRemats[RematIdx].hasNullScore()) {
1434-
REMAT_DEBUG(dbgs() << "*** Stop on null score | " << *Remat.DefMI);
1435-
RematIdx = -1;
1426+
for (; RematIdx && TargetRegions.any(); --RematIdx) {
1427+
const ScoredRemat &Candidate = ScoredRemats[RematIdx - 1];
1428+
// Stop rematerializing on encountering a null score. Since scores
1429+
// monotonically decrease as we rematerialize, we know there is nothing
1430+
// useful left to do in such cases, even if we were to re-score.
1431+
if (Candidate.hasNullScore()) {
1432+
RematIdx = 0;
14361433
break;
14371434
}
14381435

1436+
const RematReg &Remat = *Candidate.Remat;
14391437
// When previous rematerializations in this round have already satisfied
14401438
// RP targets in all regions this rematerialization can impact, we have a
14411439
// good indication that our scores have diverged significantly from
14421440
// reality, in which case we interrupt this round and re-score. This also
14431441
// ensures that every rematerialization we perform is possibly impactful
14441442
// in at least one target region.
1445-
if (!Remat.maybeBeneficial(TargetRegions, RPTargets)) {
1446-
REMAT_DEBUG(dbgs() << "*** Stop round on stale score | "
1447-
<< *Remat.DefMI);
1443+
if (!Remat.maybeBeneficial(TargetRegions, RPTargets))
14481444
break;
1449-
}
14501445

1451-
REMAT_DEBUG(dbgs() << "*** REMAT [" << Remat.DefRegion << " -> "
1452-
<< Remat.UseRegion << "] | " << *Remat.DefMI);
1446+
REMAT_DEBUG(dbgs() << "** REMAT " << PrintRematReg(Remat) << '\n';);
14531447
// Every rematerialization we do here is likely to move the instruction
14541448
// into a higher frequency region, increasing the total sum latency of the
14551449
// instruction itself. This is acceptable if we are eliminating a spill in
@@ -1466,14 +1460,18 @@ bool PreRARematStage::initGCNSchedStage() {
14661460
++RoundNum;
14671461
#endif
14681462
REMAT_DEBUG({
1469-
if (!TargetRegions.any())
1470-
dbgs() << "*** Stop round on all targets achieved\n";
1471-
else if (RematIdx == -1)
1472-
dbgs() << "*** Stop round on exhausted remat opportunities\n";
1463+
if (!TargetRegions.any()) {
1464+
dbgs() << "** Interrupt round on all targets achieved\n";
1465+
} else if (RematIdx) {
1466+
dbgs() << "** Interrupt round on stale score for "
1467+
<< *ScoredRemats[RematIdx - 1].Remat->DefMI;
1468+
} else {
1469+
dbgs() << "** Stop on exhausted rematerialization candidates\n";
1470+
}
14731471
});
14741472

14751473
// Peel off registers we already rematerialized from the vector's tail.
1476-
ScoredRemats.truncate(RematIdx + 1);
1474+
ScoredRemats.truncate(RematIdx);
14771475
} while ((updateAndVerifyRPTargets(RecomputeRP) || TargetRegions.any()) &&
14781476
!ScoredRemats.empty());
14791477
if (RescheduleRegions.none())
@@ -2185,20 +2183,11 @@ PreRARematStage::ScoredRemat::FreqInfo::FreqInfo(
21852183
if (!MinFreq)
21862184
return;
21872185

2188-
// Normalize to minimum observed frequency to avoid overflows when adding up
2189-
// frequencies.
2186+
// Normalize to minimum observed frequency to avoid underflows/overflows when
2187+
// combining frequencies.
21902188
for (uint64_t &Freq : Regions)
21912189
Freq /= MinFreq;
21922190
MaxFreq /= MinFreq;
2193-
2194-
// Compute the scaling factor for scoring frequency differences.
2195-
const uint64_t MaxDiff = MaxFreq - 1;
2196-
const uint64_t MaxReprFreqValue = (1 << FreqDiffWidth) - 1;
2197-
RescaleIsDenom = (2 * MaxDiff) & ~MaxReprFreqValue;
2198-
if (RescaleIsDenom)
2199-
RescaleFactor = (2 * MaxDiff) >> FreqDiffWidth;
2200-
else
2201-
RescaleFactor = MaxDiff ? MaxReprFreqValue / (2 * MaxDiff) : 1;
22022191
}
22032192

22042193
PreRARematStage::ScoredRemat::ScoredRemat(const RematReg *Remat,
@@ -2221,7 +2210,7 @@ unsigned PreRARematStage::ScoredRemat::getNumRegs(
22212210
return divideCeil(RegSize, 32);
22222211
}
22232212

2224-
uint64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
2213+
int64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
22252214
// Get frequencies of defining and using regions. A rematerialization from the
22262215
// least frequent region to the most frequent region will yield the greatest
22272216
// latency penalty and therefore should get minimum score. Reciprocally, a
@@ -2233,36 +2222,22 @@ uint64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
22332222
uint64_t UseOrMax = Freq.Regions[Remat->UseRegion];
22342223
if (!UseOrMax)
22352224
UseOrMax = Freq.MaxFreq;
2236-
2237-
// Maximum difference in frequency between defining and using regions.
2238-
const uint64_t MaxDiff = Freq.MaxFreq - 1;
2239-
// The difference between defining and using frequency is in the range
2240-
// [-MaxDiff, MaxDiff], shift it to [0,2 x MaxDiff] to stay in the positive
2241-
// range, then rescale to the representable range in the final score.
2242-
const uint64_t FreqDiff = (MaxDiff + (DefOrOne - UseOrMax));
2243-
if (Freq.RescaleIsDenom)
2244-
return FreqDiff / Freq.RescaleFactor;
2245-
return FreqDiff * Freq.RescaleFactor;
2225+
return DefOrOne - UseOrMax;
22462226
}
22472227

22482228
void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
22492229
ArrayRef<GCNRPTarget> RPTargets,
22502230
const FreqInfo &FreqInfo,
22512231
bool ReduceSpill) {
2252-
setNullScore();
2253-
if (!Remat->maybeBeneficial(TargetRegions, RPTargets))
2254-
return;
2255-
2256-
Register Reg = Remat->getReg();
2257-
uint64_t MaxFreq = 0;
2258-
ScoreTy NumBenefitingRegions = 0;
2232+
MaxFreq = 0;
2233+
RegionImpact = 0;
22592234
for (unsigned I : TargetRegions.set_bits()) {
2260-
if (!Remat->Live[I] || !RPTargets[I].isSaveBeneficial(Reg))
2235+
if (!Remat->Live[I] || !RPTargets[I].isSaveBeneficial(Remat->getReg()))
22612236
continue;
22622237
bool UnusedLT = Remat->isUnusedLiveThrough(I);
22632238

22642239
// Regions in which RP is guaranteed to decrease have more weight.
2265-
NumBenefitingRegions += UnusedLT ? 2 : 1;
2240+
RegionImpact += UnusedLT ? 2 : 1;
22662241

22672242
if (ReduceSpill) {
22682243
uint64_t Freq = FreqInfo.Regions[I];
@@ -2274,9 +2249,6 @@ void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
22742249
MaxFreq = std::max(MaxFreq, Freq);
22752250
}
22762251
}
2277-
setMaxFreqScore(MaxFreq);
2278-
setFreqDiffScore(FreqDiff);
2279-
setRegionImpactScore(NumBenefitingRegions * NumRegs);
22802252
}
22812253

22822254
void PreRARematStage::rematerialize(const RematReg &Remat,

0 commit comments

Comments
 (0)