@@ -1204,6 +1204,18 @@ bool PreRARematStage::initGCNSchedStage() {
12041204 });
12051205
12061206 const ScoredRemat::FreqInfo FreqInfo (MF, DAG);
1207+ REMAT_DEBUG ({
1208+ dbgs () << " Region frequencies\n " ;
1209+ for (auto [I, Freq] : enumerate(FreqInfo.Regions )) {
1210+ dbgs () << REMAT_PREFIX << " [" << I << " ] " ;
1211+ if (Freq)
1212+ dbgs () << Freq;
1213+ else
1214+ dbgs () << " unknown " ;
1215+ dbgs () << " | " << *DAG.Regions [I].first ;
1216+ }
1217+ });
1218+
12071219 SmallVector<ScoredRemat> ScoredRemats;
12081220 for (const RematReg &Remat : RematRegs)
12091221 ScoredRemats.emplace_back (&Remat, FreqInfo, DAG);
@@ -1982,7 +1994,6 @@ PreRARematStage::ScoredRemat::FreqInfo::FreqInfo(
19821994 const unsigned NumRegions = DAG.Regions .size ();
19831995 uint64_t MinFreq = MBFI.getEntryFreq ().getFrequency ();
19841996 Regions.reserve (NumRegions);
1985- MaxFreq = 0 ;
19861997 for (unsigned I = 0 ; I < NumRegions; ++I) {
19871998 MachineBasicBlock *MBB = DAG.Regions [I].first ->getParent ();
19881999 uint64_t BlockFreq = MBFI.getBlockFreq (MBB).getFrequency ();
@@ -1992,25 +2003,23 @@ PreRARematStage::ScoredRemat::FreqInfo::FreqInfo(
19922003 else if (BlockFreq > MaxFreq)
19932004 MaxFreq = BlockFreq;
19942005 }
1995- if (MinFreq) {
1996- // Normalize to minimum observed frequency to avoid overflows when adding up
1997- // frequencies.
1998- for (uint64_t &Freq : Regions)
1999- Freq /= MinFreq;
2000- MaxFreq /= MinFreq;
2001- }
2006+ if (!MinFreq)
2007+ return ;
20022008
2003- REMAT_DEBUG ({
2004- dbgs () << " Region frequencies\n " ;
2005- for (auto [I, Freq] : enumerate(Regions)) {
2006- dbgs () << REMAT_PREFIX << " [" << I << " ] " ;
2007- if (Freq)
2008- dbgs () << Freq;
2009- else
2010- dbgs () << " unknown " ;
2011- dbgs () << " | " << *DAG.Regions [I].first ;
2012- }
2013- });
2009+ // Normalize to minimum observed frequency to avoid overflows when adding up
2010+ // frequencies.
2011+ for (uint64_t &Freq : Regions)
2012+ Freq /= MinFreq;
2013+ MaxFreq /= MinFreq;
2014+
2015+ // Compute the scaling factor for scoring frequency differences.
2016+ const uint64_t MaxDiff = MaxFreq - 1 ;
2017+ const uint64_t MaxReprFreqValue = (1 << FreqDiffWidth) - 1 ;
2018+ RescaleIsDenom = (2 * MaxDiff) & ~MaxReprFreqValue;
2019+ if (RescaleIsDenom)
2020+ RescaleFactor = (2 * MaxDiff) >> FreqDiffWidth;
2021+ else
2022+ RescaleFactor = MaxDiff ? MaxReprFreqValue / (2 * MaxDiff) : 1 ;
20142023}
20152024
20162025PreRARematStage::ScoredRemat::ScoredRemat (const RematReg *Remat,
@@ -2020,39 +2029,43 @@ PreRARematStage::ScoredRemat::ScoredRemat(const RematReg *Remat,
20202029
// Diff hunk for ScoredRemat::getNumRegs ('-' marks a removed line, '+' an added
// line, unmarked lines are unchanged context).
//
// The function returns divideCeil(<size in bits>, 32) for the register being
// rematerialized. Before the change, <size in bits> was always the size of the
// register's class, which the removed FIXME notes overestimates the count when
// only a subregister is rematerialized. After the change, when operand 0 of
// Remat->DefMI carries a subregister index, the size reported for that index
// is used instead, but only if it is smaller than the register class size.
20212030unsigned PreRARematStage::ScoredRemat::getNumRegs (
20222031 const GCNScheduleDAGMILive &DAG) const {
2023- // FIXME: this doesn't account for the fact that the rematerialization may be
2024- // for a subregister. In that case we will overestimate the number of
2025- // registers involved. This is acceptable since this is purely used for the
2026- // scoring heuristic, but we should find a way to compute the number of
2027- // registers actually covered by the register/subregister pair.
20282032 const TargetRegisterClass &RC = *DAG.MRI .getRegClass (Remat->getReg ());
2029- return divideCeil (DAG.TRI ->getRegSizeInBits (RC), 32 );
2033+ unsigned RegSize = DAG.TRI ->getRegSizeInBits (RC);
2034+ if (unsigned SubIdx = Remat->DefMI ->getOperand (0 ).getSubReg ()) {
2035+ // The following may return -1 (i.e., a large unsigned number) on indices
2036+ // that may be used to access subregisters of multiple sizes; in such cases
2037+ // fall back on the size derived from the register class.
2038+ unsigned SubRegSize = DAG.TRI ->getSubRegIdxSize (SubIdx);
// The strict '<' comparison both applies the narrower subregister size and
// rejects the large "unknown size" value described above.
2039+ if (SubRegSize < RegSize)
2040+ RegSize = SubRegSize;
2041+ }
2042+ return divideCeil (RegSize, 32 );
20302043}
20312044
// Diff hunk for ScoredRemat::getFreqDiff ('-' marks a removed line, '+' an
// added line, unmarked lines are unchanged context).
//
// The function reads the frequencies of the remat's defining and using regions
// from the FreqInfo argument, substituting 1 for a zero defining frequency and
// MaxFreq for a zero using frequency. It then computes
// MaxDiff + (DefOrOne - UseOrMax) and rescales the result.
//
// Changes in this hunk: the parameter is renamed from Info to Freq, and the
// locally computed RescaleDenom divisor is replaced by the RescaleIsDenom /
// RescaleFactor members of FreqInfo. The shifted difference is divided by
// RescaleFactor when RescaleIsDenom is set and multiplied by it otherwise.
2032- uint64_t PreRARematStage::ScoredRemat::getFreqDiff (const FreqInfo &Info ) const {
2045+ uint64_t PreRARematStage::ScoredRemat::getFreqDiff (const FreqInfo &Freq ) const {
20332046 // Get frequencies of defining and using regions. A rematerialization from the
20342047 // least frequent region to the most frequent region will yield the greatest
20352048 // latency penalty and therefore should get minimum score. Reciprocally, a
20362049 // rematerialization in the other direction should get maximum score. Default
20372050 // to values that will yield the worst possible score given known frequencies
20382051 // in order to penalize rematerializations from or into regions whose
20392052 // frequency is unknown.
2040- uint64_t DefOrOne = Info .Regions [Remat->DefRegion ];
2053+ uint64_t DefOrOne = Freq .Regions [Remat->DefRegion ];
20412054 if (!DefOrOne)
20422055 DefOrOne = 1 ;
2043- uint64_t UseOrMax = Info .Regions [Remat->UseRegion ];
2056+ uint64_t UseOrMax = Freq .Regions [Remat->UseRegion ];
20442057 if (!UseOrMax)
2045- UseOrMax = Info .MaxFreq ;
2058+ UseOrMax = Freq .MaxFreq ;
20462059
20472060 // Maximum difference in frequency between defining and using regions.
2048- const uint64_t MaxDiff = Info.MaxFreq - 1 ;
2049- // This is equivalent to max( (2 * MaxDiff) / 2^NumBitsLatency , 1 ).
2050- const uint64_t RescaleDenom =
2051- std::max (MaxDiff >> (FreqDiffWidth - 1 ), (uint64_t )1 );
2061+ const uint64_t MaxDiff = Freq.MaxFreq - 1 ;
20522062 // The difference between defining and using frequency is in the range
20532063 // [-MaxDiff, MaxDiff], shift it to [0,2 x MaxDiff] to stay in the positive
2054- // range, then rescale to [0, 2^NumBitsLatency - 1]
2055- return (MaxDiff + (DefOrOne - UseOrMax)) / RescaleDenom;
2064+ // range, then rescale to the representable range in the final score.
// NOTE(review): DefOrOne - UseOrMax is an unsigned subtraction and wraps when
// UseOrMax > DefOrOne; adding MaxDiff brings the sum back into range modulo
// 2^64, presumably relying on both values lying in [1, MaxFreq] -- confirm.
2065+ const uint64_t FreqDiff = (MaxDiff + (DefOrOne - UseOrMax));
// NOTE(review): the division requires RescaleFactor to be nonzero whenever
// RescaleIsDenom is set; how both members are initialized when the FreqInfo
// constructor returns early is not visible in this hunk -- confirm.
2066+ if (Freq.RescaleIsDenom )
2067+ return FreqDiff / Freq.RescaleFactor ;
2068+ return FreqDiff * Freq.RescaleFactor ;
20562069}
20572070
20582071void PreRARematStage::ScoredRemat::update (const BitVector &TargetRegions,
0 commit comments