Skip to content

Commit 68cd753

Browse files
committed
Correctly derive (sub)reg size and frequency fix
When the (normalized) maximum region frequency is lower than the number of distinct frequency scores that can be represented (currently 2^16), the current method of computing the scaling factor shrinks the range of frequency scores that can actually be achieved (down to a single achievable score in the most extreme cases). This is caused by integer rounding when frequencies are divided to fit within the representable range. When the maximum frequency is smaller than the maximum achievable score, use a different rescaling calculation (multiplying by a scaling factor instead of dividing by one) to avoid this loss of expressivity.
1 parent 15d74b1 commit 68cd753

File tree

2 files changed

+68
-39
lines changed

2 files changed

+68
-39
lines changed

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 48 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1204,6 +1204,18 @@ bool PreRARematStage::initGCNSchedStage() {
12041204
});
12051205

12061206
const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
1207+
REMAT_DEBUG({
1208+
dbgs() << "Region frequencies\n";
1209+
for (auto [I, Freq] : enumerate(FreqInfo.Regions)) {
1210+
dbgs() << REMAT_PREFIX << " [" << I << "] ";
1211+
if (Freq)
1212+
dbgs() << Freq;
1213+
else
1214+
dbgs() << "unknown ";
1215+
dbgs() << " | " << *DAG.Regions[I].first;
1216+
}
1217+
});
1218+
12071219
SmallVector<ScoredRemat> ScoredRemats;
12081220
for (const RematReg &Remat : RematRegs)
12091221
ScoredRemats.emplace_back(&Remat, FreqInfo, DAG);
@@ -1982,7 +1994,6 @@ PreRARematStage::ScoredRemat::FreqInfo::FreqInfo(
19821994
const unsigned NumRegions = DAG.Regions.size();
19831995
uint64_t MinFreq = MBFI.getEntryFreq().getFrequency();
19841996
Regions.reserve(NumRegions);
1985-
MaxFreq = 0;
19861997
for (unsigned I = 0; I < NumRegions; ++I) {
19871998
MachineBasicBlock *MBB = DAG.Regions[I].first->getParent();
19881999
uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency();
@@ -1992,25 +2003,23 @@ PreRARematStage::ScoredRemat::FreqInfo::FreqInfo(
19922003
else if (BlockFreq > MaxFreq)
19932004
MaxFreq = BlockFreq;
19942005
}
1995-
if (MinFreq) {
1996-
// Normalize to minimum observed frequency to avoid overflows when adding up
1997-
// frequencies.
1998-
for (uint64_t &Freq : Regions)
1999-
Freq /= MinFreq;
2000-
MaxFreq /= MinFreq;
2001-
}
2006+
if (!MinFreq)
2007+
return;
20022008

2003-
REMAT_DEBUG({
2004-
dbgs() << "Region frequencies\n";
2005-
for (auto [I, Freq] : enumerate(Regions)) {
2006-
dbgs() << REMAT_PREFIX << " [" << I << "] ";
2007-
if (Freq)
2008-
dbgs() << Freq;
2009-
else
2010-
dbgs() << "unknown ";
2011-
dbgs() << " | " << *DAG.Regions[I].first;
2012-
}
2013-
});
2009+
// Normalize to minimum observed frequency to avoid overflows when adding up
2010+
// frequencies.
2011+
for (uint64_t &Freq : Regions)
2012+
Freq /= MinFreq;
2013+
MaxFreq /= MinFreq;
2014+
2015+
// Compute the scaling factor for scoring frequency differences.
2016+
const uint64_t MaxDiff = MaxFreq - 1;
2017+
const uint64_t MaxReprFreqValue = (1 << FreqDiffWidth) - 1;
2018+
RescaleIsDenom = (2 * MaxDiff) & ~MaxReprFreqValue;
2019+
if (RescaleIsDenom)
2020+
RescaleFactor = (2 * MaxDiff) >> FreqDiffWidth;
2021+
else
2022+
RescaleFactor = MaxDiff ? MaxReprFreqValue / (2 * MaxDiff) : 1;
20142023
}
20152024

20162025
PreRARematStage::ScoredRemat::ScoredRemat(const RematReg *Remat,
@@ -2020,39 +2029,43 @@ PreRARematStage::ScoredRemat::ScoredRemat(const RematReg *Remat,
20202029

20212030
unsigned PreRARematStage::ScoredRemat::getNumRegs(
20222031
const GCNScheduleDAGMILive &DAG) const {
2023-
// FIXME: this doesn't account for the fact that the rematerialization may be
2024-
// for a subregister. In that case we will overestimate the number of
2025-
// registers involved. This is acceptable since this is purely used for the
2026-
// scoring heuristic, but we should find a way to compute the number of
2027-
// registers actually covered by the register/subregister pair.
20282032
const TargetRegisterClass &RC = *DAG.MRI.getRegClass(Remat->getReg());
2029-
return divideCeil(DAG.TRI->getRegSizeInBits(RC), 32);
2033+
unsigned RegSize = DAG.TRI->getRegSizeInBits(RC);
2034+
if (unsigned SubIdx = Remat->DefMI->getOperand(0).getSubReg()) {
2035+
// The following may return -1 (i.e., a large unsigned number) on indices
2036+
// that may be used to access subregisters of multiple sizes; in such cases
2037+
// fall back on the size derived from the register class.
2038+
unsigned SubRegSize = DAG.TRI->getSubRegIdxSize(SubIdx);
2039+
if (SubRegSize < RegSize)
2040+
RegSize = SubRegSize;
2041+
}
2042+
return divideCeil(RegSize, 32);
20302043
}
20312044

2032-
uint64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Info) const {
2045+
uint64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
20332046
// Get frequencies of defining and using regions. A rematerialization from the
20342047
// least frequent region to the most frequent region will yield the greatest
20352048
// latency penalty and therefore should get minimum score. Reciprocally, a
20362049
// rematerialization in the other direction should get maximum score. Default
20372050
// to values that will yield the worst possible score given known frequencies
20382051
// in order to penalize rematerializations from or into regions whose
20392052
// frequency is unknown.
2040-
uint64_t DefOrOne = Info.Regions[Remat->DefRegion];
2053+
uint64_t DefOrOne = Freq.Regions[Remat->DefRegion];
20412054
if (!DefOrOne)
20422055
DefOrOne = 1;
2043-
uint64_t UseOrMax = Info.Regions[Remat->UseRegion];
2056+
uint64_t UseOrMax = Freq.Regions[Remat->UseRegion];
20442057
if (!UseOrMax)
2045-
UseOrMax = Info.MaxFreq;
2058+
UseOrMax = Freq.MaxFreq;
20462059

20472060
// Maximum difference in frequency between defining and using regions.
2048-
const uint64_t MaxDiff = Info.MaxFreq - 1;
2049-
// This is equivalent to max( (2 * MaxDiff) / 2^NumBitsLatency , 1 ).
2050-
const uint64_t RescaleDenom =
2051-
std::max(MaxDiff >> (FreqDiffWidth - 1), (uint64_t)1);
2061+
const uint64_t MaxDiff = Freq.MaxFreq - 1;
20522062
// The difference between defining and using frequency is in the range
20532063
// [-MaxDiff, MaxDiff], shift it to [0,2 x MaxDiff] to stay in the positive
2054-
// range, then rescale to [0, 2^NumBitsLatency - 1]
2055-
return (MaxDiff + (DefOrOne - UseOrMax)) / RescaleDenom;
2064+
// range, then rescale to the representable range in the final score.
2065+
const uint64_t FreqDiff = (MaxDiff + (DefOrOne - UseOrMax));
2066+
if (Freq.RescaleIsDenom)
2067+
return FreqDiff / Freq.RescaleFactor;
2068+
return FreqDiff * Freq.RescaleFactor;
20562069
}
20572070

20582071
void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -505,7 +505,14 @@ class PreRARematStage : public GCNSchedStage {
505505
/// frequency. 0 when unknown.
506506
SmallVector<uint64_t> Regions;
507507
/// Maximum observed frequency, normalized to minimum observed frequency.
508-
uint64_t MaxFreq;
508+
uint64_t MaxFreq = 0;
509+
/// Rescaling factor for scoring frequency differences in the range [0, 2
510+
/// * (MaxFreq - 1)].
511+
uint64_t RescaleFactor = 0;
512+
/// Whether the rescaling factor should be used as a denominator (when the
513+
/// maximum frequency is "big") or as a numerator (when the maximum
514+
/// frequency is "small").
515+
bool RescaleIsDenom = false;
509516

510517
FreqInfo(MachineFunction &MF, const GCNScheduleDAGMILive &DAG);
511518
};
@@ -562,20 +569,29 @@ class PreRARematStage : public GCNSchedStage {
562569
void setMaxFreqScore(ScoreTy MaxFreq) {
563570
MaxFreq = std::min(
564571
static_cast<ScoreTy>(std::numeric_limits<uint32_t>::max()), MaxFreq);
565-
Score |= MaxFreq << (FreqDiffWidth + RegionImpactWidth);
572+
MaxFreq <<= FreqDiffWidth + RegionImpactWidth;
573+
574+
ScoreTy Mask = ((ScoreTy)1 << (FreqDiffWidth + RegionImpactWidth)) - 1;
575+
Score = MaxFreq | (Score & Mask);
566576
}
567577

568578
void setFreqDiffScore(ScoreTy FreqDiff) {
569579
FreqDiff = std::min(
570580
static_cast<ScoreTy>(std::numeric_limits<uint16_t>::max()), FreqDiff);
571-
Score |= FreqDiff << RegionImpactWidth;
581+
FreqDiff <<= RegionImpactWidth;
582+
583+
ScoreTy Mask = ((ScoreTy)1 << (FreqDiffWidth)) - 1;
584+
Mask <<= RegionImpactWidth;
585+
Score = FreqDiff | (Score & ~Mask);
572586
}
573587

574588
void setRegionImpactScore(ScoreTy RegionImpact) {
575589
RegionImpact =
576590
std::min(static_cast<ScoreTy>(std::numeric_limits<uint16_t>::max()),
577591
RegionImpact);
578-
Score |= RegionImpact;
592+
593+
ScoreTy Mask = ((ScoreTy)1 << (RegionImpactWidth)) - 1;
594+
Score = RegionImpact | (Score & ~Mask);
579595
}
580596

581597
unsigned getNumRegs(const GCNScheduleDAGMILive &DAG) const;

0 commit comments

Comments
 (0)