Skip to content

Commit 76a4111

Browse files
committed
Improve code for frequency-based calculations
- Add some explanatory comments for the calculations.
- Normalize frequencies to minimum to avoid overflows.
- Create helper struct to hold all frequency information (helpful for further improvements).
1 parent d452fa9 commit 76a4111

File tree

2 files changed

+95
-69
lines changed

2 files changed

+95
-69
lines changed

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 74 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -1193,34 +1193,7 @@ bool PreRARematStage::initGCNSchedStage() {
11931193
printTargetRegions(/*PrintAll=*/TargetRegions.none());
11941194
});
11951195

1196-
// Compute region frequencies. 0 encodes an unknown region frequency.
1197-
SmallVector<uint64_t> RegionFreq;
1198-
RegionFreq.reserve(NumRegions);
1199-
assert(DAG.MLI && "MLI not defined in DAG");
1200-
MachineBranchProbabilityInfo MBPI;
1201-
MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI);
1202-
uint64_t MinFreq = MBFI.getEntryFreq().getFrequency(), MaxFreq = 0;
1203-
for (const MachineBasicBlock *MBB : RegionBB) {
1204-
uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency();
1205-
RegionFreq.push_back(BlockFreq);
1206-
if (BlockFreq < MinFreq)
1207-
MinFreq = BlockFreq;
1208-
else if (BlockFreq > MaxFreq)
1209-
MaxFreq = BlockFreq;
1210-
}
1211-
REMAT_DEBUG({
1212-
dbgs() << "Region frequencies:\n";
1213-
for (auto [I, Freq] : enumerate(RegionFreq)) {
1214-
dbgs() << REMAT_PREFIX << " [" << I << "] ";
1215-
if (Freq)
1216-
dbgs() << Freq;
1217-
else
1218-
dbgs() << "unknown ";
1219-
dbgs() << " | " << *DAG.Regions[I].first;
1220-
}
1221-
});
1222-
1223-
if (!collectRematRegs(MIRegion, RegionFreq)) {
1196+
if (!collectRematRegs(MIRegion)) {
12241197
REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
12251198
return false;
12261199
}
@@ -1230,20 +1203,21 @@ bool PreRARematStage::initGCNSchedStage() {
12301203
Remat.print();
12311204
});
12321205

1206+
const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
12331207
SmallVector<ScoredRemat> ScoredRemats;
12341208
for (const RematReg &Remat : RematRegs)
1235-
ScoredRemats.emplace_back(&Remat, MinFreq, MaxFreq, DAG);
1236-
BitVector RecomputeRP(NumRegions);
1209+
ScoredRemats.emplace_back(&Remat, FreqInfo, DAG);
12371210

12381211
// Rematerialize registers in successive rounds until all RP targets are
12391212
// satisfied or until we run out of rematerialization candidates.
12401213
#ifndef NDEBUG
12411214
unsigned RoundNum = 0;
12421215
#endif
1216+
BitVector RecomputeRP(NumRegions);
12431217
do {
12441218
// (Re-)Score and (re-)sort all remats in increasing score order.
12451219
for (ScoredRemat &Remat : ScoredRemats)
1246-
Remat.update(TargetRegions, RPTargets, RegionFreq, !TargetOcc);
1220+
Remat.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc);
12471221
sort(ScoredRemats);
12481222

12491223
REMAT_DEBUG({
@@ -1885,10 +1859,7 @@ bool PreRARematStage::setObjective() {
18851859
}
18861860

18871861
bool PreRARematStage::collectRematRegs(
1888-
const DenseMap<MachineInstr *, unsigned> &MIRegion,
1889-
ArrayRef<uint64_t> RegionFreq) {
1890-
assert(RegionFreq.size() == DAG.Regions.size());
1891-
1862+
const DenseMap<MachineInstr *, unsigned> &MIRegion) {
18921863
// We need up-to-date live-out info. to query live-out register masks in
18931864
// regions containing rematerializable instructions.
18941865
DAG.RegionLiveOuts.buildLiveRegMap();
@@ -1948,7 +1919,7 @@ bool PreRARematStage::collectRematRegs(
19481919

19491920
// Add the instruction to the rematerializable list.
19501921
RematRegSet.insert(Reg);
1951-
RematRegs.emplace_back(&DefMI, UseMI, DAG, MIRegion, RegionFreq);
1922+
RematRegs.emplace_back(&DefMI, UseMI, DAG, MIRegion);
19521923
}
19531924
}
19541925

@@ -1957,12 +1928,10 @@ bool PreRARematStage::collectRematRegs(
19571928

19581929
PreRARematStage::RematReg::RematReg(
19591930
MachineInstr *DefMI, MachineInstr *UseMI, GCNScheduleDAGMILive &DAG,
1960-
const DenseMap<MachineInstr *, unsigned> &MIRegion,
1961-
ArrayRef<uint64_t> RegionFreq)
1931+
const DenseMap<MachineInstr *, unsigned> &MIRegion)
19621932
: DefMI(DefMI), UseMI(UseMI), LiveIn(DAG.Regions.size()),
19631933
LiveOut(DAG.Regions.size()), Live(DAG.Regions.size()),
1964-
DefRegion(MIRegion.at(DefMI)), UseRegion(MIRegion.at(UseMI)),
1965-
DefFrequency(RegionFreq[DefRegion]), UseFrequency(RegionFreq[UseRegion]) {
1934+
DefRegion(MIRegion.at(DefMI)), UseRegion(MIRegion.at(UseMI)) {
19661935

19671936
// Mark regions in which the rematerializable register is live.
19681937
Register Reg = getReg();
@@ -2004,11 +1973,50 @@ void PreRARematStage::RematReg::insertMI(unsigned RegionIdx,
20041973
DAG.LIS->createAndComputeVirtRegInterval(RematMI->getOperand(0).getReg());
20051974
}
20061975

1976+
PreRARematStage::ScoredRemat::FreqInfo::FreqInfo(
1977+
MachineFunction &MF, const GCNScheduleDAGMILive &DAG) {
1978+
assert(DAG.MLI && "MLI not defined in DAG");
1979+
MachineBranchProbabilityInfo MBPI;
1980+
MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI);
1981+
1982+
const unsigned NumRegions = DAG.Regions.size();
1983+
uint64_t MinFreq = MBFI.getEntryFreq().getFrequency();
1984+
Regions.reserve(NumRegions);
1985+
MaxFreq = 0;
1986+
for (unsigned I = 0; I < NumRegions; ++I) {
1987+
MachineBasicBlock *MBB = DAG.Regions[I].first->getParent();
1988+
uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency();
1989+
Regions.push_back(BlockFreq);
1990+
if (BlockFreq && BlockFreq < MinFreq)
1991+
MinFreq = BlockFreq;
1992+
else if (BlockFreq > MaxFreq)
1993+
MaxFreq = BlockFreq;
1994+
}
1995+
if (MinFreq) {
1996+
// Normalize to minimum observed frequency to avoid overflows when adding up
1997+
// frequencies.
1998+
for (uint64_t &Freq : Regions)
1999+
Freq /= MinFreq;
2000+
MaxFreq /= MinFreq;
2001+
}
2002+
2003+
REMAT_DEBUG({
2004+
dbgs() << "Region frequencies\n";
2005+
for (auto [I, Freq] : enumerate(Regions)) {
2006+
dbgs() << REMAT_PREFIX << " [" << I << "] ";
2007+
if (Freq)
2008+
dbgs() << Freq;
2009+
else
2010+
dbgs() << "unknown ";
2011+
dbgs() << " | " << *DAG.Regions[I].first;
2012+
}
2013+
});
2014+
}
2015+
20072016
PreRARematStage::ScoredRemat::ScoredRemat(const RematReg *Remat,
2008-
uint64_t MinFreq, uint64_t MaxFreq,
2017+
const FreqInfo &Freq,
20092018
const GCNScheduleDAGMILive &DAG)
2010-
: Remat(Remat), NumRegs(getNumRegs(DAG)),
2011-
FreqDiff(getFreqDiff(MinFreq, MaxFreq)) {}
2019+
: Remat(Remat), NumRegs(getNumRegs(DAG)), FreqDiff(getFreqDiff(Freq)) {}
20122020

20132021
unsigned PreRARematStage::ScoredRemat::getNumRegs(
20142022
const GCNScheduleDAGMILive &DAG) const {
@@ -2021,23 +2029,35 @@ unsigned PreRARematStage::ScoredRemat::getNumRegs(
20212029
return divideCeil(DAG.TRI->getRegSizeInBits(RC), 32);
20222030
}
20232031

2024-
uint64_t PreRARematStage::ScoredRemat::getFreqDiff(uint64_t MinFreq,
2025-
uint64_t MaxFreq) const {
2026-
uint64_t DefOrMin = Remat->DefFrequency ? Remat->DefFrequency : MinFreq;
2027-
uint64_t UseOrMax = Remat->UseFrequency ? Remat->UseFrequency : MaxFreq;
2028-
uint64_t MaxDiff = MaxFreq - MinFreq;
2029-
// This is equivalent to (2 * MaxDiff) / 2^NumBitsLatency.
2030-
uint64_t RescaleDenom = MaxDiff >> (FreqDiffWidth - 1);
2031-
RescaleDenom = std::max(RescaleDenom, (uint64_t)1);
2032+
uint64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Info) const {
2033+
// Get frequencies of defining and using regions. A rematerialization from the
2034+
// least frequent region to the most frequent region will yield the greatest
2035+
// latency penalty and therefore should get minimum score. Reciprocally, a
2036+
// rematerialization in the other direction should get maximum score. Default
2037+
// to values that will yield the worst possible score given known frequencies
2038+
// in order to penalize rematerializations from or into regions whose
2039+
// frequency is unknown.
2040+
uint64_t DefOrOne = Info.Regions[Remat->DefRegion];
2041+
if (!DefOrOne)
2042+
DefOrOne = 1;
2043+
uint64_t UseOrMax = Info.Regions[Remat->UseRegion];
2044+
if (!UseOrMax)
2045+
UseOrMax = Info.MaxFreq;
2046+
2047+
// Maximum difference in frequency between defining and using regions.
2048+
const uint64_t MaxDiff = Info.MaxFreq - 1;
2049+
// This is equivalent to max( (2 * MaxDiff) / 2^NumBitsLatency , 1 ).
2050+
const uint64_t RescaleDenom =
2051+
std::max(MaxDiff >> (FreqDiffWidth - 1), (uint64_t)1);
20322052
// The difference between defining and using frequency is in the range
20332053
// [-MaxDiff, MaxDiff], shift it to [0,2 x MaxDiff] to stay in the positive
20342054
// range, then rescale to [0, 2^NumBitsLatency - 1]
2035-
return (MaxDiff + (DefOrMin - UseOrMax)) / RescaleDenom;
2055+
return (MaxDiff + (DefOrOne - UseOrMax)) / RescaleDenom;
20362056
}
20372057

20382058
void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
20392059
ArrayRef<GCNRPTarget> RPTargets,
2040-
ArrayRef<uint64_t> RegionFreq,
2060+
const FreqInfo &FreqInfo,
20412061
bool ReduceSpill) {
20422062
setNullScore();
20432063
if (!Remat->maybeBeneficial(TargetRegions, RPTargets))
@@ -2055,7 +2075,7 @@ void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
20552075
NumBenefitingRegions += UnusedLT ? 2 : 1;
20562076

20572077
if (ReduceSpill) {
2058-
uint64_t Freq = RegionFreq[I];
2078+
uint64_t Freq = FreqInfo.Regions[I];
20592079
if (!UnusedLT) {
20602080
// Apply a frequency penalty in regions in which we are not sure that RP
20612081
// will decrease.

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Lines changed: 21 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -448,7 +448,7 @@ class ClusteredLowOccStage : public GCNSchedStage {
448448
/// rematerialized.
449449
class PreRARematStage : public GCNSchedStage {
450450
private:
451-
/// Groups information about a rematerializable register.
451+
/// A rematerializable register.
452452
struct RematReg {
453453
/// Single MI defining the rematerializable register.
454454
MachineInstr *DefMI;
@@ -460,13 +460,10 @@ class PreRARematStage : public GCNSchedStage {
460460
LaneBitmask Mask;
461461
/// Defining and using regions.
462462
unsigned DefRegion, UseRegion;
463-
/// Frequency of defining/using regions. 0 when unknown.
464-
uint64_t DefFrequency, UseFrequency;
465463

466464
RematReg(MachineInstr *DefMI, MachineInstr *UseMI,
467465
GCNScheduleDAGMILive &DAG,
468-
const DenseMap<MachineInstr *, unsigned> &MIRegion,
469-
ArrayRef<uint64_t> RegionFreq);
466+
const DenseMap<MachineInstr *, unsigned> &MIRegion);
470467

471468
/// Returns the rematerializable register. Do not call after deleting the
472469
/// original defining instruction.
@@ -502,15 +499,26 @@ class PreRARematStage : public GCNSchedStage {
502499
/// The rematerializable register under consideration.
503500
const RematReg *Remat;
504501

502+
/// Execution frequency information required by scoring heuristics.
503+
struct FreqInfo {
504+
/// Per-region execution frequencies, normalized to minimum observed
505+
/// frequency. 0 when unknown.
506+
SmallVector<uint64_t> Regions;
507+
/// Maximum observed frequency, normalized to minimum observed frequency.
508+
uint64_t MaxFreq;
509+
510+
FreqInfo(MachineFunction &MF, const GCNScheduleDAGMILive &DAG);
511+
};
512+
505513
/// This only initializes state-independent characteristics of \p Remat, not
506514
/// the actual score.
507-
ScoredRemat(const RematReg *Remat, uint64_t MinFreq, uint64_t MaxFreq,
515+
ScoredRemat(const RematReg *Remat, const FreqInfo &Freq,
508516
const GCNScheduleDAGMILive &DAG);
509517

510518
/// Updates the rematerialization's score w.r.t. the current \p RPTargets.
511519
/// \p Freq holds the execution frequency information of each region.
512520
void update(const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets,
513-
ArrayRef<uint64_t> RegionFreq, bool ReduceSpill);
521+
const FreqInfo &Freq, bool ReduceSpill);
514522

515523
/// Returns whether the current score is null.
516524
bool hasNullScore() const { return !Score; }
@@ -541,8 +549,8 @@ class PreRARematStage : public GCNSchedStage {
541549
const uint64_t FreqDiff;
542550

543551
using ScoreTy = uint64_t;
544-
/// Overall rematerialization score. Scoring components are mapped to bit
545-
/// ranges in the overall score.
552+
/// Rematerialization score. Scoring components are mapped to bit ranges in
553+
/// the score.
546554
///
547555
/// [63:32] : maximum frequency in benefiting target region (spilling only)
548556
/// [31:16] : frequency difference between defining and using region
@@ -572,7 +580,7 @@ class PreRARematStage : public GCNSchedStage {
572580

573581
unsigned getNumRegs(const GCNScheduleDAGMILive &DAG) const;
574582

575-
uint64_t getFreqDiff(uint64_t MinFreq, uint64_t MaxFreq) const;
583+
uint64_t getFreqDiff(const FreqInfo &Freq) const;
576584
};
577585

578586
/// Holds enough information to rollback a rematerialization decision post
@@ -624,11 +632,9 @@ class PreRARematStage : public GCNSchedStage {
624632
bool updateAndVerifyRPTargets(const BitVector &Regions);
625633

626634
/// Collects all rematerializable registers and appends them to \ref
627-
/// RematRegs. \p MIRegion maps MIs to their region and \p RegionFreq contains
628-
/// the frequency of each region, 0 indicating an unknown frequency. Returns
629-
/// whether any rematerializable register was found.
630-
bool collectRematRegs(const DenseMap<MachineInstr *, unsigned> &MIRegion,
631-
ArrayRef<uint64_t> RegionFreq);
635+
/// RematRegs. \p MIRegion maps MIs to their region. Returns whether any
636+
/// rematerializable register was found.
637+
bool collectRematRegs(const DenseMap<MachineInstr *, unsigned> &MIRegion);
632638

633639
/// Rematerializes \p Remat. This removes the rematerialized register from
634640
/// live-in/out lists in the DAG and updates RP targets in all affected

0 commit comments

Comments
 (0)