Skip to content

Commit fde49a3

Browse files
committed
Correctly derive (sub)reg size and frequency fix
In cases where the (normalized) maximum region frequency is lower than the number of different frequency scores representable (2^16 currently), the current method of computing the scaling factor shrinks the actually achievable range of frequency scores available (up to a single achievable score in the most extreme cases). This is due to integer rounding when dividing frequencies to fit within the representable range. When the maximum frequency is smaller than the maximum score achievable, use a different rescaling calculation to avoid the expressivity loss.
1 parent f57f3bd commit fde49a3

File tree

2 files changed

+68
-39
lines changed

2 files changed

+68
-39
lines changed

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 48 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1383,6 +1383,18 @@ bool PreRARematStage::initGCNSchedStage() {
13831383
});
13841384

13851385
const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
1386+
REMAT_DEBUG({
1387+
dbgs() << "Region frequencies\n";
1388+
for (auto [I, Freq] : enumerate(FreqInfo.Regions)) {
1389+
dbgs() << REMAT_PREFIX << " [" << I << "] ";
1390+
if (Freq)
1391+
dbgs() << Freq;
1392+
else
1393+
dbgs() << "unknown ";
1394+
dbgs() << " | " << *DAG.Regions[I].first;
1395+
}
1396+
});
1397+
13861398
SmallVector<ScoredRemat> ScoredRemats;
13871399
for (const RematReg &Remat : RematRegs)
13881400
ScoredRemats.emplace_back(&Remat, FreqInfo, DAG);
@@ -2182,7 +2194,6 @@ PreRARematStage::ScoredRemat::FreqInfo::FreqInfo(
21822194
const unsigned NumRegions = DAG.Regions.size();
21832195
uint64_t MinFreq = MBFI.getEntryFreq().getFrequency();
21842196
Regions.reserve(NumRegions);
2185-
MaxFreq = 0;
21862197
for (unsigned I = 0; I < NumRegions; ++I) {
21872198
MachineBasicBlock *MBB = DAG.Regions[I].first->getParent();
21882199
uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency();
@@ -2192,25 +2203,23 @@ PreRARematStage::ScoredRemat::FreqInfo::FreqInfo(
21922203
else if (BlockFreq > MaxFreq)
21932204
MaxFreq = BlockFreq;
21942205
}
2195-
if (MinFreq) {
2196-
// Normalize to minimum observed frequency to avoid overflows when adding up
2197-
// frequencies.
2198-
for (uint64_t &Freq : Regions)
2199-
Freq /= MinFreq;
2200-
MaxFreq /= MinFreq;
2201-
}
2206+
if (!MinFreq)
2207+
return;
22022208

2203-
REMAT_DEBUG({
2204-
dbgs() << "Region frequencies\n";
2205-
for (auto [I, Freq] : enumerate(Regions)) {
2206-
dbgs() << REMAT_PREFIX << " [" << I << "] ";
2207-
if (Freq)
2208-
dbgs() << Freq;
2209-
else
2210-
dbgs() << "unknown ";
2211-
dbgs() << " | " << *DAG.Regions[I].first;
2212-
}
2213-
});
2209+
// Normalize to minimum observed frequency to avoid overflows when adding up
2210+
// frequencies.
2211+
for (uint64_t &Freq : Regions)
2212+
Freq /= MinFreq;
2213+
MaxFreq /= MinFreq;
2214+
2215+
// Compute the scaling factor for scoring frequency differences.
2216+
const uint64_t MaxDiff = MaxFreq - 1;
2217+
const uint64_t MaxReprFreqValue = (1 << FreqDiffWidth) - 1;
2218+
RescaleIsDenom = (2 * MaxDiff) & ~MaxReprFreqValue;
2219+
if (RescaleIsDenom)
2220+
RescaleFactor = (2 * MaxDiff) >> FreqDiffWidth;
2221+
else
2222+
RescaleFactor = MaxDiff ? MaxReprFreqValue / (2 * MaxDiff) : 1;
22142223
}
22152224

22162225
PreRARematStage::ScoredRemat::ScoredRemat(const RematReg *Remat,
@@ -2220,39 +2229,43 @@ PreRARematStage::ScoredRemat::ScoredRemat(const RematReg *Remat,
22202229

22212230
unsigned PreRARematStage::ScoredRemat::getNumRegs(
22222231
const GCNScheduleDAGMILive &DAG) const {
2223-
// FIXME: this doesn't account for the fact that the rematerialization may be
2224-
// for a subregister. In that case we will overestimate the number of
2225-
// registers involved. This is acceptable since this is purely used for the
2226-
// scoring heuristic, but we should find a way to compute the number of
2227-
// registers actually covered by the register/subregister pair.
22282232
const TargetRegisterClass &RC = *DAG.MRI.getRegClass(Remat->getReg());
2229-
return divideCeil(DAG.TRI->getRegSizeInBits(RC), 32);
2233+
unsigned RegSize = DAG.TRI->getRegSizeInBits(RC);
2234+
if (unsigned SubIdx = Remat->DefMI->getOperand(0).getSubReg()) {
2235+
// The following may return -1 (i.e., a large unsigned number) on indices
2236+
// that may be used to access subregisters of multiple sizes; in such cases
2237+
// fallback on the size derived from the register class.
2238+
unsigned SubRegSize = DAG.TRI->getSubRegIdxSize(SubIdx);
2239+
if (SubRegSize < RegSize)
2240+
RegSize = SubRegSize;
2241+
}
2242+
return divideCeil(RegSize, 32);
22302243
}
22312244

2232-
uint64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Info) const {
2245+
uint64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
22332246
// Get frequencies of defining and using regions. A rematerialization from the
22342247
// least frequent region to the most frequent region will yield the greatest
22352248
// latency penalty and therefore should get minimum score. Reciprocally, a
22362249
// rematerialization in the other direction should get maximum score. Default
22372250
// to values that will yield the worst possible score given known frequencies
22382251
// in order to penalize rematerializations from or into regions whose
22392252
// frequency is unknown.
2240-
uint64_t DefOrOne = Info.Regions[Remat->DefRegion];
2253+
uint64_t DefOrOne = Freq.Regions[Remat->DefRegion];
22412254
if (!DefOrOne)
22422255
DefOrOne = 1;
2243-
uint64_t UseOrMax = Info.Regions[Remat->UseRegion];
2256+
uint64_t UseOrMax = Freq.Regions[Remat->UseRegion];
22442257
if (!UseOrMax)
2245-
UseOrMax = Info.MaxFreq;
2258+
UseOrMax = Freq.MaxFreq;
22462259

22472260
// Maximum difference in frequency between defining and using regions.
2248-
const uint64_t MaxDiff = Info.MaxFreq - 1;
2249-
// This is equivalent to max( (2 * MaxDiff) / 2^NumBitsLatency , 1 ).
2250-
const uint64_t RescaleDenom =
2251-
std::max(MaxDiff >> (FreqDiffWidth - 1), (uint64_t)1);
2261+
const uint64_t MaxDiff = Freq.MaxFreq - 1;
22522262
// The difference between defining and using frequency is in the range
22532263
// [-MaxDiff, MaxDiff], shift it to [0,2 x MaxDiff] to stay in the positive
2254-
// range, then rescale to [0, 2^NumBitsLatency - 1]
2255-
return (MaxDiff + (DefOrOne - UseOrMax)) / RescaleDenom;
2264+
// range, then rescale to the representable range in the final score.
2265+
const uint64_t FreqDiff = (MaxDiff + (DefOrOne - UseOrMax));
2266+
if (Freq.RescaleIsDenom)
2267+
return FreqDiff / Freq.RescaleFactor;
2268+
return FreqDiff * Freq.RescaleFactor;
22562269
}
22572270

22582271
void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -524,7 +524,14 @@ class PreRARematStage : public GCNSchedStage {
524524
/// frequency. 0 when unknown.
525525
SmallVector<uint64_t> Regions;
526526
/// Maximum observed frequency, normalized to minimum observed frequency.
527-
uint64_t MaxFreq;
527+
uint64_t MaxFreq = 0;
528+
/// Rescaling factor for scoring frequency differences in the range [0, 2
529+
/// * (MaxFreq - 1)].
530+
uint64_t RescaleFactor = 0;
531+
/// Whether the rescaling factor should be used as a denominator (when the
532+
/// maximum frequency is "big") or as a nominator (when the maximum
533+
/// frequency is "small").
534+
bool RescaleIsDenom = false;
528535

529536
FreqInfo(MachineFunction &MF, const GCNScheduleDAGMILive &DAG);
530537
};
@@ -581,20 +588,29 @@ class PreRARematStage : public GCNSchedStage {
581588
void setMaxFreqScore(ScoreTy MaxFreq) {
582589
MaxFreq = std::min(
583590
static_cast<ScoreTy>(std::numeric_limits<uint32_t>::max()), MaxFreq);
584-
Score |= MaxFreq << (FreqDiffWidth + RegionImpactWidth);
591+
MaxFreq <<= FreqDiffWidth + RegionImpactWidth;
592+
593+
ScoreTy Mask = ((ScoreTy)1 << (FreqDiffWidth + RegionImpactWidth)) - 1;
594+
Score = MaxFreq | (Score & Mask);
585595
}
586596

587597
void setFreqDiffScore(ScoreTy FreqDiff) {
588598
FreqDiff = std::min(
589599
static_cast<ScoreTy>(std::numeric_limits<uint16_t>::max()), FreqDiff);
590-
Score |= FreqDiff << RegionImpactWidth;
600+
FreqDiff <<= RegionImpactWidth;
601+
602+
ScoreTy Mask = ((ScoreTy)1 << (FreqDiffWidth)) - 1;
603+
Mask <<= RegionImpactWidth;
604+
Score = FreqDiff | (Score & ~Mask);
591605
}
592606

593607
void setRegionImpactScore(ScoreTy RegionImpact) {
594608
RegionImpact =
595609
std::min(static_cast<ScoreTy>(std::numeric_limits<uint16_t>::max()),
596610
RegionImpact);
597-
Score |= RegionImpact;
611+
612+
ScoreTy Mask = ((ScoreTy)1 << (RegionImpactWidth)) - 1;
613+
Score = RegionImpact | (Score & ~Mask);
598614
}
599615

600616
unsigned getNumRegs(const GCNScheduleDAGMILive &DAG) const;

0 commit comments

Comments
 (0)