Commit c871117

Address more feedback
- Use SchedModel instead of instruction itinerary.
- Don't use getNumCoveredRegs to get the number of registers; use the register class instead.
- Clarify some comments.
- Other minor changes.
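For readers not familiar with the two latency APIs mentioned in the first bullet, the sketch below contrasts the itinerary-based query this commit drops with the scheduling-model query it adopts. It is illustrative only: the helper names are invented, and only the two LLVM calls themselves appear in the diff below.

// Illustrative sketch; helper names are invented and not part of the commit.
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/MC/MCInstrItineraries.h"
using namespace llvm;

// Old path: itinerary-driven latency through TargetInstrInfo.
static unsigned latencyViaItinerary(const TargetInstrInfo &TII,
                                    const InstrItineraryData *Itin,
                                    const MachineInstr &MI) {
  return TII.getInstrLatency(Itin, MI);
}

// New path: the subtarget's TargetSchedModel, which falls back on MC
// scheduling model defaults when no per-opcode data is available.
static unsigned latencyViaSchedModel(const TargetSchedModel &SM,
                                     const MachineInstr &MI) {
  return SM.computeInstrLatency(&MI);
}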
Parent: 8d2fa9c

2 files changed: +88 −45 lines

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 58 additions & 32 deletions
@@ -1225,7 +1225,7 @@ bool PreRARematStage::initGCNSchedStage() {
 << "] REMAT (always) | " << *Remat.DefMI);
 rematerialize(Remat, RecomputeRP);
 } else {
-ScoredRemats.emplace_back(&Remat, DAG.ST, *DAG.TII);
+ScoredRemats.emplace_back(&Remat, DAG);
 }
 }
 unsetSatisifedRPTargets(RescheduleRegions);
@@ -1286,9 +1286,12 @@ bool PreRARematStage::initGCNSchedStage() {
 REMAT_DEBUG(dbgs() << "[" << MIRegion[Remat.DefMI] << "] REMAT *" << Score
 << "* | " << *Remat.DefMI);
 MachineInstr *RematMI = rematerialize(Remat, RecomputeRP);
-// Every rematerialization done with the objective of increasing occupancy
-// increases latency. If we don't manage to increase occupancy, we want to
-// roll them back.
+// Every rematerialization we do here is likely to move the instruction
+// into a higher frequency region, increasing the total sum latency of the
+// instruction itself. This is acceptable if we are eliminating a spill in
+// the process, but when the goal is increasing occupancy we get nothing
+// out of rematerialization if occupancy is not increased in the end; in
+// such cases we want to roll back the rematerialization.
 if (TargetOcc)
 Rollbackable.push_back({RematMI, &Remat});
 unsetSatisifedRPTargets(Remat.Live);
@@ -1315,7 +1318,7 @@ bool PreRARematStage::initGCNSchedStage() {
 DAG.Pressure[I] = RP;
 unsigned NewRegionOcc = RP.getOccupancy(ST, DynamicVGPRBlockSize);
 AchievedOcc = std::min(AchievedOcc, NewRegionOcc);
-REMAT_DEBUG(dbgs() << "[" << I << "] Achieved occupancy " << NewRegionOcc
+REMAT_DEBUG(dbgs() << '[' << I << "] Achieved occupancy " << NewRegionOcc
 << " (" << RPTargets[I] << ")\n");
 }

@@ -1878,9 +1881,9 @@ bool PreRARematStage::collectRematRegs(ArrayRef<uint64_t> RegionFreq) {
 // regions containing rematerializable instructions.
 DAG.RegionLiveOuts.buildLiveRegMap();

-// Set of registers already marked for potential remterialization; used for
-// remat chains checks.
-DenseSet<Register> RematRegSet;
+// Set of registers already marked for potential rematerialization; used to
+// avoid rematerialization chains.
+SmallSet<Register, 4> RematRegSet;
 auto IsMORematable = [&RematRegSet](const MachineOperand &MO) -> bool {
 return MO.isReg() && RematRegSet.contains(MO.getReg());
 };
@@ -1979,15 +1982,45 @@ PreRARematStage::RematReg::insertMI(unsigned RegionIdx,
 return NewMI;
 }

+unsigned PreRARematStage::ScoredRemat::getNumRegs(
+const GCNScheduleDAGMILive &DAG) const {
+// FIXME: this doesn't account for the fact that the rematerialization may be
+// for a subregister. In that case we will overestimate the number of
+// registers involved. This is acceptable since this is purely used for the
+// scoring heuristic, but we should find a way to compute the number of
+// registers actually covered by the register/subregister pair.
+Register Reg = Remat->DefMI->getOperand(0).getReg();
+const TargetRegisterClass &RC = *DAG.MRI.getRegClass(Reg);
+return divideCeil(DAG.TRI->getRegSizeInBits(RC), 32);
+}
+
+unsigned PreRARematStage::ScoredRemat::getLatencyGain(
+const GCNScheduleDAGMILive &DAG) const {
+if (hasUnknownLatencyGain())
+return 0;
+
+const TargetSchedModel &SchedModel = DAG.ST.getInstrInfo()->getSchedModel();
+
+// Rematerializing the register to its using region changes the number of
+// times we will execute it in total.
+unsigned FreqDiff = Remat->UseFrequency - Remat->DefFrequency;
+int RematLatDiff = FreqDiff * SchedModel.computeInstrLatency(Remat->DefMI);
+
+// We assume that spilling the register means we have to insert a save in its
+// defining region and a restore in its using region. Spill instruction
+// opcodes do not have corresponding scheduling models so we cannot accurately
+// estimate their latency. Since this is just meant as a heuristic, use the
+// default high latency from the MC scheduling model.
+int SpillLatDiff = SchedModel.getMCSchedModel()->DefaultHighLatency *
+(Remat->DefFrequency + Remat->UseFrequency);
+
+return SpillLatDiff - RematLatDiff;
+}
+
 PreRARematStage::ScoredRemat::ScoredRemat(const RematReg *Remat,
-const GCNSubtarget &ST,
-const TargetInstrInfo &TII)
-: Remat(Remat) {
-const InstrItineraryData *Itin = ST.getInstrItineraryData();
-if (Remat->DefFrequency && Remat->UseFrequency) {
-InstrLatencyGain = Remat->DefFrequency - Remat->UseFrequency;
-*InstrLatencyGain *= TII.getInstrLatency(Itin, *Remat->DefMI);
-}
+const GCNScheduleDAGMILive &DAG)
+: Remat(Remat), NumRegs(getNumRegs(DAG)),
+RematLatencyGainOverSpill(getLatencyGain(DAG)) {
 resetScore();
 }
@@ -2011,20 +2044,13 @@ void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
 // we get by increasing occupancy and compare it to the latency hit each wave
 // will be subjected to.
 if (ReduceSpill) {
-// It may be better to let the register spill if it is defined by a very
-// high latency instruction. Try to estimate the latency gain induced by
-// rematerializing the register.
-//
-// If we don't know the rematerializations's latency gain we don't know
-// what to compare the spill latency against. We still consider the
-// rematerialization potentially beneficial in such cases because we don't
-// want to miss rematerialization opportunities and rematerializing is in
-// most cases cheaper than spilling. We still give a bonus to remats for
-// which we are able to do the calculation.
-if (InstrLatencyGain && *InstrLatencyGain < 0) {
-int SpillLatencyGain = SaveCost * Remat->DefFrequency;
-SpillLatencyGain += RestoreCost * Remat->UseFrequency;
-if (*InstrLatencyGain + SpillLatencyGain < 0)
+// If we don't know the latency gain, we still consider the
+// rematerialization potentially beneficial because we don't want to miss
+// rematerialization opportunities and rematerializing is in most cases
+// cheaper than spilling. We still give a bonus to remats for which we are
+// able to do the calculation.
+if (!hasUnknownLatencyGain()) {
+if (RematLatencyGainOverSpill < 0)
 return setUselessRemat();
 setKnownLatencyGain();
 }
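For intuition on how RematLatencyGainOverSpill, computed in getLatencyGain above, drives this check, here is the arithmetic with invented frequencies and latencies; this is a sketch only, and the DefaultHighLatency value is assumed for the example rather than taken from any target.

// Invented example values, mirroring the arithmetic in getLatencyGain.
int exampleLatencyGain() {
  unsigned DefFrequency = 1, UseFrequency = 8; // remat moves the def into a hotter region
  unsigned InstrLatency = 4;                   // stand-in for SchedModel.computeInstrLatency(DefMI)
  unsigned DefaultHighLatency = 10;            // stand-in for the MC model's fallback latency
  int RematLatDiff = (UseFrequency - DefFrequency) * InstrLatency;       // 7 * 4  = 28
  int SpillLatDiff = DefaultHighLatency * (DefFrequency + UseFrequency); // 10 * 9 = 90
  return SpillLatDiff - RematLatDiff;          // 62 > 0: rematerializing beats spilling
}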
@@ -2033,7 +2059,7 @@ void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
 // The estimated RP reduction is proportional to the total frequency in target
 // regions where the register is live.
 Register Reg = Remat->DefMI->getOperand(0).getReg();
-unsigned RPScore = 0;
+ScoreTy RPScore = 0;
 for (unsigned I : TargetRegions.set_bits()) {
 unsigned Freq = std::max(RegionFreq[I], static_cast<uint64_t>(1));
 if (Remat->isBeneficialRegion(I))
@@ -2044,7 +2070,7 @@ void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,

 // The estimated RP reduction is directly proportional to the size of the
 // rematerializable register.
-setRPScore(RPScore * SIRegisterInfo::getNumCoveredRegs(Remat->Mask));
+setRPScore(RPScore * NumRegs);
 }

 MachineInstr *PreRARematStage::rematerialize(const RematReg &Remat,
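A small worked example of the new register-class-based count (values invented): a register whose class is 128 bits wide counts as four 32-bit units in the RP score multiplier, even if the def only writes a subregister, which is the overestimate the FIXME in getNumRegs acknowledges.

// Invented example of the new register count: divideCeil(RegSizeInBits, 32).
unsigned exampleNumRegs() {
  unsigned RegSizeInBits = 128;             // e.g. a 128-bit register class (assumed)
  return (RegSizeInBits + 31) / 32;         // == 4, matching divideCeil(128, 32)
}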

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Lines changed: 30 additions & 13 deletions
@@ -441,6 +441,19 @@ class ClusteredLowOccStage : public GCNSchedStage {
 /// reducing spilling or increasing occupancy is possible, it tries to
 /// rematerialize as few registers as possible to reduce potential negative
 /// effects on function latency.
+///
+/// The stage only supports rematerializing registers that meet all of the
+/// following constraints:
+/// 1. The register is virtual and has a single defining instruction.
+/// 2. The single defining instruction is either deemed rematerializable by the
+/// target-independent logic, or if not, has no non-constant and
+/// non-ignorable physical register use.
+/// 3. The register has no virtual register use whose live range would be
+/// extended by the rematerialization.
+/// 4. The register has a single non-debug user in a different region from its
+/// defining region.
+/// 5. The register does not use, and is not used by, another register that is
+/// going to be rematerialized.
 class PreRARematStage : public GCNSchedStage {
 private:
 /// Groups information about a rematerializable register.
@@ -520,8 +533,7 @@ class PreRARematStage : public GCNSchedStage {

 /// This only initializes state-independent characteristics of \p Remat, not
 /// the actual score.
-ScoredRemat(const RematReg *Remat, const GCNSubtarget &ST,
-const TargetInstrInfo &TII);
+ScoredRemat(const RematReg *Remat, const GCNScheduleDAGMILive &DAG);

 /// Updates the rematerialization's score w.r.t. the current \p RPTargets.
 /// \p RegionFreq indicates the frequency of each region
@@ -540,19 +552,22 @@ class PreRARematStage : public GCNSchedStage {
 }

 private:
-/// Estimated save/restore latency costs for spilling a register to stack.
-/// FIXME: These numbers are very arbitrary. Need a good rationale for them,
-/// which I don't know where to get from.
-static constexpr int SaveCost = 100, RestoreCost = 100;
 /// Per-region contribution weights to RP score depending on whether RP is
 /// guaranteed or only likely to be reduced in the region. Only their
 /// relative value w.r.t. one another matter.
 static constexpr int WeightRP = 10, WeightRPMaybe = 5;

-/// Latency gain induced by rematerializing the instruction. Does not
-/// include estimated spilling cost of *not* rematerializing (save/restore
-/// to/from stack).
-std::optional<int> InstrLatencyGain = std::nullopt;
+/// Number of 32-bit registers this rematerialization covers.
+const unsigned NumRegs;
+/// Latency gain induced by rematerializing the register over spilling its
+/// defining instruction.
+const int RematLatencyGainOverSpill;
+
+/// Whether we are unable to estimate the latency gain of rematerializing
+/// over spilling; the estimate requires knowing defining/using region
+/// frequencies.
+bool hasUnknownLatencyGain() const {
+return !Remat->DefFrequency || !Remat->UseFrequency;
+}

 using ScoreTy = int32_t;
 /// Overall rematerialization score. Scoring components are mapped to bit
@@ -568,9 +583,11 @@ class PreRARematStage : public GCNSchedStage {

 void setKnownLatencyGain() { Score |= 1; }

-void setRPScore(unsigned RPScore) {
-Score |= static_cast<ScoreTy>(RPScore) << 1;
-}
+void setRPScore(ScoreTy RPScore) { Score |= RPScore << 1; }
+
+unsigned getNumRegs(const GCNScheduleDAGMILive &DAG) const;
+
+unsigned getLatencyGain(const GCNScheduleDAGMILive &DAG) const;
 };

 /// Maps all MIs (except lone terminators, which are not part of any region)
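The packing shown above keeps the latency bonus in bit 0 of Score and the RP contribution in the higher bits, so any difference in RP score outweighs the bonus when two rematerializations are compared. A tiny illustration with invented values:

// Invented values, only to illustrate the bit layout of Score used above.
#include <cstdint>
int32_t exampleScore() {
  int32_t Score = 0;
  Score |= 1;            // setKnownLatencyGain(): latency bonus occupies bit 0
  int32_t RPScore = 7;   // e.g. weighted target-region frequency sum times NumRegs (made up)
  Score |= RPScore << 1; // setRPScore(): RP contribution occupies bits 1 and up
  return Score;          // 15; a remat with RPScore 8 and no latency bonus scores 16
}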
