@@ -1225,7 +1225,7 @@ bool PreRARematStage::initGCNSchedStage() {
                   << "] REMAT (always) | " << *Remat.DefMI);
       rematerialize(Remat, RecomputeRP);
     } else {
-      ScoredRemats.emplace_back(&Remat, DAG.ST, *DAG.TII);
+      ScoredRemats.emplace_back(&Remat, DAG);
     }
   }
   unsetSatisifedRPTargets(RescheduleRegions);
@@ -1286,9 +1286,12 @@ bool PreRARematStage::initGCNSchedStage() {
     REMAT_DEBUG(dbgs() << "[" << MIRegion[Remat.DefMI] << "] REMAT *" << Score
                        << "* | " << *Remat.DefMI);
     MachineInstr *RematMI = rematerialize(Remat, RecomputeRP);
-    // Every rematerialization done with the objective of increasing occupancy
-    // increases latency. If we don't manage to increase occupancy, we want to
-    // roll them back.
+    // Every rematerialization we do here is likely to move the instruction
+    // into a higher-frequency region, increasing the total latency the
+    // instruction contributes over the whole function. This is acceptable if
+    // we eliminate a spill in the process, but when the goal is increasing
+    // occupancy we gain nothing from rematerialization unless occupancy
+    // actually increases in the end; in such cases we want to roll it back.
     if (TargetOcc)
       Rollbackable.push_back({RematMI, &Remat});
     unsetSatisifedRPTargets(Remat.Live);
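
The hunk above only records occupancy-motivated remats as rollback candidates. A minimal standalone sketch of the decision this sets up, with assumed names (`undoRemat`, `maybeRollbackAll`, and the stub types are illustrative, not the stage's real API):

```cpp
#include <utility>
#include <vector>

struct MachineInstrStub {}; // stand-in for llvm::MachineInstr
struct RematRegStub {};     // stand-in for PreRARematStage::RematReg

// Illustrative only: re-insert the original def and erase the remat copy.
void undoRemat(MachineInstrStub *RematMI, const RematRegStub *Remat) {}

// Occupancy-motivated remats add latency; they only pay off if the
// occupancy target is actually reached, so undo all of them otherwise.
void maybeRollbackAll(
    unsigned AchievedOcc, unsigned TargetOcc,
    std::vector<std::pair<MachineInstrStub *, const RematRegStub *>>
        &Rollbackable) {
  if (AchievedOcc >= TargetOcc)
    return;
  for (auto &[RematMI, Remat] : Rollbackable)
    undoRemat(RematMI, Remat);
}
```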
@@ -1315,7 +1318,7 @@ bool PreRARematStage::initGCNSchedStage() {
     DAG.Pressure[I] = RP;
     unsigned NewRegionOcc = RP.getOccupancy(ST, DynamicVGPRBlockSize);
     AchievedOcc = std::min(AchievedOcc, NewRegionOcc);
-    REMAT_DEBUG(dbgs() << "[" << I << "] Achieved occupancy " << NewRegionOcc
+    REMAT_DEBUG(dbgs() << '[' << I << "] Achieved occupancy " << NewRegionOcc
                        << " (" << RPTargets[I] << ")\n");
   }
 
@@ -1878,9 +1881,9 @@ bool PreRARematStage::collectRematRegs(ArrayRef<uint64_t> RegionFreq) {
   // regions containing rematerializable instructions.
   DAG.RegionLiveOuts.buildLiveRegMap();
 
-  // Set of registers already marked for potential remterialization; used for
-  // remat chains checks.
-  DenseSet<Register> RematRegSet;
+  // Set of registers already marked for potential rematerialization; used to
+  // avoid rematerialization chains.
+  SmallSet<Register, 4> RematRegSet;
   auto IsMORematable = [&RematRegSet](const MachineOperand &MO) -> bool {
     return MO.isReg() && RematRegSet.contains(MO.getReg());
   };
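
For context, a minimal standalone sketch of the chain check the lambda above performs, with `Register` reduced to a plain integer ID (the real code walks `MachineOperand`s instead of a register vector):

```cpp
#include <set>
#include <vector>

using Register = unsigned;

// A candidate def is rejected when any register it reads is itself already
// marked for rematerialization: rematerializing B after A, where B uses A,
// would chain the two remats together, which the stage avoids.
bool usesAlreadyMarkedReg(const std::vector<Register> &UsedRegs,
                          const std::set<Register> &RematRegSet) {
  for (Register Reg : UsedRegs)
    if (RematRegSet.count(Reg))
      return true;
  return false;
}
```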
@@ -1979,15 +1982,45 @@ PreRARematStage::RematReg::insertMI(unsigned RegionIdx,
   return NewMI;
 }
 
+unsigned PreRARematStage::ScoredRemat::getNumRegs(
+    const GCNScheduleDAGMILive &DAG) const {
+  // FIXME: this doesn't account for the fact that the rematerialization may
+  // be for a subregister. In that case we will overestimate the number of
+  // registers involved. This is acceptable since this is purely used for the
+  // scoring heuristic, but we should find a way to compute the number of
+  // registers actually covered by the register/subregister pair.
+  Register Reg = Remat->DefMI->getOperand(0).getReg();
+  const TargetRegisterClass &RC = *DAG.MRI.getRegClass(Reg);
+  return divideCeil(DAG.TRI->getRegSizeInBits(RC), 32);
+}
+
+int PreRARematStage::ScoredRemat::getLatencyGain(
+    const GCNScheduleDAGMILive &DAG) const {
+  if (hasUnknownLatencyGain())
+    return 0;
+
+  const TargetSchedModel &SchedModel = DAG.ST.getInstrInfo()->getSchedModel();
+
+  // Rematerializing the register to its using region changes the number of
+  // times we execute the defining instruction in total.
+  int FreqDiff = Remat->UseFrequency - Remat->DefFrequency;
+  int RematLatDiff = FreqDiff * SchedModel.computeInstrLatency(Remat->DefMI);
+
+  // We assume that spilling the register means inserting a save in its
+  // defining region and a restore in its using region. Spill instruction
+  // opcodes have no corresponding scheduling model, so we cannot accurately
+  // estimate their latency. Since this is just meant as a heuristic, use the
+  // default high latency from the MC scheduling model.
+  int SpillLatDiff = SchedModel.getMCSchedModel()->DefaultHighLatency *
+                     (Remat->DefFrequency + Remat->UseFrequency);
+
+  return SpillLatDiff - RematLatDiff;
+}
+
 PreRARematStage::ScoredRemat::ScoredRemat(const RematReg *Remat,
-                                          const GCNSubtarget &ST,
-                                          const TargetInstrInfo &TII)
-    : Remat(Remat) {
-  const InstrItineraryData *Itin = ST.getInstrItineraryData();
-  if (Remat->DefFrequency && Remat->UseFrequency) {
-    InstrLatencyGain = Remat->DefFrequency - Remat->UseFrequency;
-    *InstrLatencyGain *= TII.getInstrLatency(Itin, *Remat->DefMI);
-  }
+                                          const GCNScheduleDAGMILive &DAG)
+    : Remat(Remat), NumRegs(getNumRegs(DAG)),
+      RematLatencyGainOverSpill(getLatencyGain(DAG)) {
   resetScore();
 }
 
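To make the two helpers above concrete, here is a standalone model with made-up numbers; the frequencies, the instruction latency, and the value 10 standing in for the MC scheduling model's default high latency are assumptions for illustration only:

```cpp
#include <cstdio>

int main() {
  // getNumRegs: a 128-bit register class covers ceil(128 / 32) = 4 VGPRs.
  unsigned RegSizeInBits = 128;
  unsigned NumRegs = (RegSizeInBits + 31) / 32; // divideCeil(Size, 32)

  // getLatencyGain: assumed frequencies and latencies.
  unsigned DefFrequency = 1;        // defining region runs once
  unsigned UseFrequency = 4;        // using region sits in a loop
  unsigned DefMILatency = 4;        // latency of the rematerialized def
  unsigned DefaultHighLatency = 10; // assumed spill save/restore latency

  // Rematerializing moves the def into the using region, so it now runs
  // UseFrequency times instead of DefFrequency times.
  int RematLatDiff = (int)(UseFrequency - DefFrequency) * (int)DefMILatency;

  // Spilling would add a save at def frequency plus a restore at use
  // frequency, each estimated at the default high latency.
  int SpillLatDiff =
      (int)DefaultHighLatency * (int)(DefFrequency + UseFrequency);

  // 50 - 12 = 38 > 0: rematerializing beats spilling for these numbers.
  printf("NumRegs = %u, gain over spill = %d\n", NumRegs,
         SpillLatDiff - RematLatDiff);
  return 0;
}
```
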
@@ -2011,20 +2044,13 @@ void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
   // we get by increasing occupancy and compare it to the latency hit each wave
   // will be subjected to.
   if (ReduceSpill) {
-    // It may be better to let the register spill if it is defined by a very
-    // high latency instruction. Try to estimate the latency gain induced by
-    // rematerializing the register.
-    //
-    // If we don't know the rematerialization's latency gain we don't know
-    // what to compare the spill latency against. We still consider the
-    // rematerialization potentially beneficial in such cases because we don't
-    // want to miss rematerialization opportunities and rematerializing is in
-    // most cases cheaper than spilling. We still give a bonus to remats for
-    // which we are able to do the calculation.
-    if (InstrLatencyGain && *InstrLatencyGain < 0) {
-      int SpillLatencyGain = SaveCost * Remat->DefFrequency;
-      SpillLatencyGain += RestoreCost * Remat->UseFrequency;
-      if (*InstrLatencyGain + SpillLatencyGain < 0)
+    // If we don't know the latency gain, we still consider the
+    // rematerialization potentially beneficial because we don't want to miss
+    // rematerialization opportunities and rematerializing is in most cases
+    // cheaper than spilling. We still give a bonus to remats for which we are
+    // able to do the calculation.
+    if (!hasUnknownLatencyGain()) {
+      if (RematLatencyGainOverSpill < 0)
         return setUselessRemat();
       setKnownLatencyGain();
     }
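
Restated in isolation, the gate above has three outcomes; a compact sketch (`RematVerdict` and `classifyRemat` are illustrative names, and the bonus/useless semantics are taken from the surrounding code, not reproduced):

```cpp
#include <optional>

enum class RematVerdict { KeepNoBonus, Useless, KeepWithBonus };

// An unknown gain keeps the remat without a bonus, a known negative gain
// marks it useless (spilling is cheaper), and a known non-negative gain
// earns the known-latency-gain bonus.
RematVerdict classifyRemat(std::optional<int> LatencyGainOverSpill) {
  if (!LatencyGainOverSpill)
    return RematVerdict::KeepNoBonus;
  return *LatencyGainOverSpill < 0 ? RematVerdict::Useless
                                   : RematVerdict::KeepWithBonus;
}
```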
@@ -2033,7 +2059,7 @@ void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
   // The estimated RP reduction is proportional to the total frequency in target
   // regions where the register is live.
   Register Reg = Remat->DefMI->getOperand(0).getReg();
-  unsigned RPScore = 0;
+  ScoreTy RPScore = 0;
   for (unsigned I : TargetRegions.set_bits()) {
     unsigned Freq = std::max(RegionFreq[I], static_cast<uint64_t>(1));
     if (Remat->isBeneficialRegion(I))
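
A standalone model of the resulting score follows; note that it collapses the beneficial / maybe-beneficial distinction the elided loop body makes, since the weighting of maybe-beneficial regions is not visible in this hunk, and the `ScoreTy` width is an assumption:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

using ScoreTy = int64_t; // assumed width for the score type

// Frequency-weighted pressure benefit over target regions, scaled by the
// number of 32-bit registers the rematerializable register covers.
ScoreTy rpScore(const std::vector<uint64_t> &RegionFreq,
                const std::vector<bool> &Beneficial, unsigned NumRegs) {
  ScoreTy Score = 0;
  for (size_t I = 0; I != RegionFreq.size(); ++I)
    if (Beneficial[I])
      Score += std::max<uint64_t>(RegionFreq[I], 1); // clamp zero freqs
  return Score * NumRegs;
}
```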
@@ -2044,7 +2070,7 @@ void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
 
   // The estimated RP reduction is directly proportional to the size of the
   // rematerializable register.
-  setRPScore(RPScore * SIRegisterInfo::getNumCoveredRegs(Remat->Mask));
+  setRPScore(RPScore * NumRegs);
 }
 
 MachineInstr *PreRARematStage::rematerialize(const RematReg &Remat,