@@ -1383,6 +1383,18 @@ bool PreRARematStage::initGCNSchedStage() {
13831383 });
13841384
13851385 const ScoredRemat::FreqInfo FreqInfo (MF, DAG);
1386+ REMAT_DEBUG ({
1387+ dbgs () << " Region frequencies\n " ;
1388+ for (auto [I, Freq] : enumerate(FreqInfo.Regions )) {
1389+ dbgs () << REMAT_PREFIX << " [" << I << " ] " ;
1390+ if (Freq)
1391+ dbgs () << Freq;
1392+ else
1393+ dbgs () << " unknown " ;
1394+ dbgs () << " | " << *DAG.Regions [I].first ;
1395+ }
1396+ });
1397+
13861398 SmallVector<ScoredRemat> ScoredRemats;
13871399 for (const RematReg &Remat : RematRegs)
13881400 ScoredRemats.emplace_back (&Remat, FreqInfo, DAG);
@@ -2182,7 +2194,6 @@ PreRARematStage::ScoredRemat::FreqInfo::FreqInfo(
21822194 const unsigned NumRegions = DAG.Regions .size ();
21832195 uint64_t MinFreq = MBFI.getEntryFreq ().getFrequency ();
21842196 Regions.reserve (NumRegions);
2185- MaxFreq = 0 ;
21862197 for (unsigned I = 0 ; I < NumRegions; ++I) {
21872198 MachineBasicBlock *MBB = DAG.Regions [I].first ->getParent ();
21882199 uint64_t BlockFreq = MBFI.getBlockFreq (MBB).getFrequency ();
@@ -2192,25 +2203,23 @@ PreRARematStage::ScoredRemat::FreqInfo::FreqInfo(
21922203 else if (BlockFreq > MaxFreq)
21932204 MaxFreq = BlockFreq;
21942205 }
2195- if (MinFreq) {
2196- // Normalize to minimum observed frequency to avoid overflows when adding up
2197- // frequencies.
2198- for (uint64_t &Freq : Regions)
2199- Freq /= MinFreq;
2200- MaxFreq /= MinFreq;
2201- }
2206+ if (!MinFreq)
2207+ return ;
22022208
2203- REMAT_DEBUG ({
2204- dbgs () << " Region frequencies\n " ;
2205- for (auto [I, Freq] : enumerate(Regions)) {
2206- dbgs () << REMAT_PREFIX << " [" << I << " ] " ;
2207- if (Freq)
2208- dbgs () << Freq;
2209- else
2210- dbgs () << " unknown " ;
2211- dbgs () << " | " << *DAG.Regions [I].first ;
2212- }
2213- });
2209+ // Normalize to minimum observed frequency to avoid overflows when adding up
2210+ // frequencies.
2211+ for (uint64_t &Freq : Regions)
2212+ Freq /= MinFreq;
2213+ MaxFreq /= MinFreq;
2214+
2215+ // Compute the scaling factor for scoring frequency differences.
2216+ const uint64_t MaxDiff = MaxFreq - 1 ;
2217+ const uint64_t MaxReprFreqValue = (1 << FreqDiffWidth) - 1 ;
2218+ RescaleIsDenom = (2 * MaxDiff) & ~MaxReprFreqValue;
2219+ if (RescaleIsDenom)
2220+ RescaleFactor = (2 * MaxDiff) >> FreqDiffWidth;
2221+ else
2222+ RescaleFactor = MaxDiff ? MaxReprFreqValue / (2 * MaxDiff) : 1 ;
22142223}
22152224
22162225PreRARematStage::ScoredRemat::ScoredRemat (const RematReg *Remat,
@@ -2220,39 +2229,43 @@ PreRARematStage::ScoredRemat::ScoredRemat(const RematReg *Remat,
22202229
22212230unsigned PreRARematStage::ScoredRemat::getNumRegs (
22222231 const GCNScheduleDAGMILive &DAG) const {
2223- // FIXME: this doesn't account for the fact that the rematerialization may be
2224- // for a subregister. In that case we will overestimate the number of
2225- // registers involved. This is acceptable since this is purely used for the
2226- // scoring heuristic, but we should find a way to compute the number of
2227- // registers actually covered by the register/subregister pair.
22282232 const TargetRegisterClass &RC = *DAG.MRI .getRegClass (Remat->getReg ());
2229- return divideCeil (DAG.TRI ->getRegSizeInBits (RC), 32 );
2233+ unsigned RegSize = DAG.TRI ->getRegSizeInBits (RC);
2234+ if (unsigned SubIdx = Remat->DefMI ->getOperand (0 ).getSubReg ()) {
2235+ // The following may return -1 (i.e., a large unsigned number) on indices
2236+ // that may be used to access subregisters of multiple sizes; in such cases
2237+ // fallback on the size derived from the register class.
2238+ unsigned SubRegSize = DAG.TRI ->getSubRegIdxSize (SubIdx);
2239+ if (SubRegSize < RegSize)
2240+ RegSize = SubRegSize;
2241+ }
2242+ return divideCeil (RegSize, 32 );
22302243}
22312244
2232- uint64_t PreRARematStage::ScoredRemat::getFreqDiff (const FreqInfo &Info ) const {
2245+ uint64_t PreRARematStage::ScoredRemat::getFreqDiff (const FreqInfo &Freq ) const {
22332246 // Get frequencies of defining and using regions. A rematerialization from the
22342247 // least frequent region to the most frequent region will yield the greatest
22352248 // latency penalty and therefore should get minimum score. Reciprocally, a
22362249 // rematerialization in the other direction should get maximum score. Default
22372250 // to values that will yield the worst possible score given known frequencies
22382251 // in order to penalize rematerializations from or into regions whose
22392252 // frequency is unknown.
2240- uint64_t DefOrOne = Info .Regions [Remat->DefRegion ];
2253+ uint64_t DefOrOne = Freq .Regions [Remat->DefRegion ];
22412254 if (!DefOrOne)
22422255 DefOrOne = 1 ;
2243- uint64_t UseOrMax = Info .Regions [Remat->UseRegion ];
2256+ uint64_t UseOrMax = Freq .Regions [Remat->UseRegion ];
22442257 if (!UseOrMax)
2245- UseOrMax = Info .MaxFreq ;
2258+ UseOrMax = Freq .MaxFreq ;
22462259
22472260 // Maximum difference in frequency between defining and using regions.
2248- const uint64_t MaxDiff = Info.MaxFreq - 1 ;
2249- // This is equivalent to max( (2 * MaxDiff) / 2^NumBitsLatency , 1 ).
2250- const uint64_t RescaleDenom =
2251- std::max (MaxDiff >> (FreqDiffWidth - 1 ), (uint64_t )1 );
2261+ const uint64_t MaxDiff = Freq.MaxFreq - 1 ;
22522262 // The difference between defining and using frequency is in the range
22532263 // [-MaxDiff, MaxDiff], shift it to [0,2 x MaxDiff] to stay in the positive
2254- // range, then rescale to [0, 2^NumBitsLatency - 1]
2255- return (MaxDiff + (DefOrOne - UseOrMax)) / RescaleDenom;
2264+ // range, then rescale to the representable range in the final score.
2265+ const uint64_t FreqDiff = (MaxDiff + (DefOrOne - UseOrMax));
2266+ if (Freq.RescaleIsDenom )
2267+ return FreqDiff / Freq.RescaleFactor ;
2268+ return FreqDiff * Freq.RescaleFactor ;
22562269}
22572270
22582271void PreRARematStage::ScoredRemat::update (const BitVector &TargetRegions,
0 commit comments