jrbyrnes
diff --git a/‎llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp‎
Lines changed: 113 additions & 44 deletions b/‎llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp‎
Lines changed: 113 additions & 44 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/GCNSchedStrategy.h‎
Lines changed: 7 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/GCNSchedStrategy.h‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/GCNSubtarget.cpp‎
Lines changed: 5 additions & 15 deletions b/‎llvm/lib/Target/AMDGPU/GCNSubtarget.cpp‎
Lines changed: 5 additions & 15 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/GCNSubtarget.h‎
Lines changed: 6 additions & 22 deletions b/‎llvm/lib/Target/AMDGPU/GCNSubtarget.h‎
Lines changed: 6 additions & 22 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp‎
Lines changed: 13 additions & 27 deletions b/‎llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp‎
Lines changed: 13 additions & 27 deletions
@@ -1675,6 +1675,41 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
   return true;
 }
 
+bool PreRARematStage::hasExcessVGPRs(const GCNRegPressure &RP,
+                                     unsigned MaxVGPRs,
+                                     unsigned &ExcessArchVGPRs,
+                                     bool &AGPRLimited) {
+  unsigned NumAGPRs = RP.getAGPRNum();
+  if (!ST.hasGFX90AInsts() || !NumAGPRs) {
+    // Non-unified RF, or no AGPR pressure. We can only reduce ArchVGPR excess
+    // at this point, but still want to identify AGPR excess (AGPRLimited).
+    bool HasSpill = false;
+    unsigned NumArchVGPRs = RP.getArchVGPRNum();
+    if (NumArchVGPRs > MaxVGPRs) {
+      ExcessArchVGPRs = NumArchVGPRs - MaxVGPRs;
+      HasSpill = true;
+    }
+    if (NumAGPRs > MaxVGPRs) {
+      AGPRLimited = true;
+      HasSpill = true;
+    }
+    return HasSpill;
+  }
+  if (RP.getVGPRNum(true) > MaxVGPRs) {
+    // Unified RF with AGPR pressure. We can only remat ArchVGPRs, whose budget
+    // is what AGPRs leave of MaxVGPRs, aligned down to 4 (none if AGPRs fill it).
+    unsigned NumArchVGPRs = RP.getArchVGPRNum();
+    if (NumAGPRs >= MaxVGPRs) {
+      AGPRLimited = true;
+      ExcessArchVGPRs = NumArchVGPRs;
+    } else {
+      ExcessArchVGPRs = NumArchVGPRs - alignDown(MaxVGPRs - NumAGPRs, 4);
+    }
+    return true;
+  }
+  return false;
+}
+
 bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
 
@@ -1684,51 +1719,86 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
   // Maps optimizable regions (i.e., regions at minimum and VGPR-limited
   // occupancy, or regions with VGPR spilling) to their excess RP.
   DenseMap<unsigned, unsigned> OptRegions;
-
-  // Note that the maximum number of VGPRs to use to eliminate spill may be
-  // lower than the maximum number to increase occupancy when the function has
-  // the "amdgpu-num-vgpr" attribute.
-  const std::pair<unsigned, unsigned> OccBounds =
+  const Function &F = MF.getFunction();
+  const bool UnifiedRF = ST.hasGFX90AInsts();
+
+  // Adjust workgroup size induced occupancy bounds with the
+  // "amdgpu-waves-per-eu" attribute. This should be offloaded to a subtarget
+  // method, but at this point it is unclear how other parts of the codebase
+  // interpret this attribute and the default behavior produces unexpected
+  // bounds. Here we want to allow users to ask for target occupancies lower
+  // than the default lower bound.
+  std::pair<unsigned, unsigned> OccBounds =
       ST.getOccupancyWithWorkGroupSizes(MF);
-  // FIXME: we should be able to just call ST.getMaxNumArchVGPRs() but that
-  // would use the occupancy bounds as determined by
-  // MF.getFunction().getWavesPerEU(), which look incorrect in some cases.
+  std::pair<unsigned, unsigned> WavesPerEU =
+      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
+  if (WavesPerEU.first <= WavesPerEU.second) {
+    if (WavesPerEU.first && WavesPerEU.first <= OccBounds.second)
+      OccBounds.first = WavesPerEU.first;
+    if (WavesPerEU.second)
+      OccBounds.second = std::min(OccBounds.second, WavesPerEU.second);
+  }
+
+  // We call the "base max functions" directly because otherwise it uses the
+  // subtarget's logic for combining "amdgpu-waves-per-eu" with the function's
+  // groupsize induced occupancy bounds, producing unexpected results.
+  const unsigned MaxSGPRsNoSpill = ST.getBaseMaxNumSGPRs(
+      F, OccBounds, ST.getMaxNumPreloadedSGPRs(), ST.getReservedNumSGPRs(F));
   const unsigned MaxVGPRsNoSpill =
-      ST.getBaseMaxNumVGPRs(MF.getFunction(),
-                            {ST.getMinNumArchVGPRs(OccBounds.second),
-                             ST.getMaxNumArchVGPRs(OccBounds.first)},
-                            false);
-  const unsigned MaxVGPRsIncOcc = ST.getMaxNumArchVGPRs(DAG.MinOccupancy + 1);
+      ST.getBaseMaxNumVGPRs(F, {ST.getMinNumVGPRs(OccBounds.second),
+                                ST.getMaxNumVGPRs(OccBounds.first)});
+  const unsigned MaxSGPRsMinOcc = ST.getMaxNumSGPRs(DAG.MinOccupancy, false);
+  const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs(DAG.MinOccupancy + 1);
   IncreaseOccupancy = OccBounds.second > DAG.MinOccupancy;
 
+  auto ClearOptRegionsIf = [&](bool Cond) -> bool {
+    if (Cond) {
+      // We won't try to increase occupancy.
+      IncreaseOccupancy = false;
+      OptRegions.clear();
+    }
+    return Cond;
+  };
+
   // Collect optimizable regions. If there is spilling in any region we will
-  // just try to reduce it. Otherwise we will try to increase occupancy by one.
+  // just try to reduce ArchVGPR spilling. Otherwise we will try to increase
+  // occupancy by one in the whole function.
   for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
     GCNRegPressure &RP = DAG.Pressure[I];
-    unsigned NumVGPRs = RP.getArchVGPRNum();
     unsigned ExcessRP = 0;
-    if (NumVGPRs > MaxVGPRsNoSpill) {
-      if (IncreaseOccupancy) {
-        // We won't try to increase occupancy.
-        IncreaseOccupancy = false;
-        OptRegions.clear();
-      }
-      // Region has VGPR spilling, we will try to reduce spilling as much as
-      // possible.
-      ExcessRP = NumVGPRs - MaxVGPRsNoSpill;
-      REMAT_DEBUG(dbgs() << "Region " << I << " is spilling VGPRs, save "
-                         << ExcessRP << " VGPR(s) to eliminate spilling\n");
+    unsigned NumSGPRs = RP.getSGPRNum();
+
+    // Check whether SGPR pressure prevents us from eliminating spilling.
+    if (NumSGPRs > MaxSGPRsNoSpill)
+      ClearOptRegionsIf(IncreaseOccupancy);
+
+    bool OccAGPRLimited = false;
+    if (hasExcessVGPRs(RP, MaxVGPRsNoSpill, ExcessRP, OccAGPRLimited)) {
+      ClearOptRegionsIf(IncreaseOccupancy);
+      REMAT_DEBUG({
+        if (ExcessRP) {
+          StringRef RegClass = UnifiedRF ? "VGPRs" : "ArchVGPRs";
+          dbgs() << "Region " << I << " is spilling " << RegClass << ", save "
+                 << ExcessRP << " to eliminate " << RegClass << "-spilling\n";
+        }
+      });
     } else if (IncreaseOccupancy) {
-      if (ST.getOccupancyWithNumSGPRs(RP.getSGPRNum()) == DAG.MinOccupancy) {
-        // Occupancy is SGPR-limited in the region, no point in trying to
-        // increase it through VGPR usage.
-        IncreaseOccupancy = false;
-        OptRegions.clear();
-      } else if (NumVGPRs > MaxVGPRsIncOcc) {
-        // Occupancy is VGPR-limited.
-        ExcessRP = NumVGPRs - MaxVGPRsIncOcc;
-        REMAT_DEBUG(dbgs() << "Region " << I << " has min. occupancy: save "
-                           << ExcessRP << " VGPR(s) to improve occupancy\n");
+      // Check whether SGPR pressure prevents us from increasing occupancy.
+      if (ClearOptRegionsIf(NumSGPRs > MaxSGPRsMinOcc))
+        continue;
+
+      if (hasExcessVGPRs(RP, MaxVGPRsIncOcc, ExcessRP, OccAGPRLimited)) {
+        // Check whether AGPR pressure prevents us from increasing occupancy.
+        if (ClearOptRegionsIf(OccAGPRLimited))
+          continue;
+        // Occupancy could be increased by rematerializing ArchVGPRs.
+        REMAT_DEBUG({
+          if (ExcessRP) {
+            StringRef RegClass = UnifiedRF ? "VGPRs" : "ArchVGPRs";
+            dbgs() << "Region " << I << " has min. occupancy: save " << ExcessRP
+                   << " " << RegClass << " to improve occupancy\n";
+          }
+        });
       }
     }
     if (ExcessRP)
@@ -1738,7 +1808,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
     return false;
 
   // When we are reducing spilling, the target is the minimum achievable
-  // occupancy implied by workgroup sizes.
+  // occupancy implied by workgroup sizes / the "amdgpu-waves-per-eu" attribute.
   TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 : OccBounds.first;
 
   // Accounts for a reduction in RP in an optimizable region. Returns whether we
@@ -2058,9 +2128,11 @@ static MachineBasicBlock *getRegionMBB(MachineFunction &MF,
 void PreRARematStage::finalizeGCNSchedStage() {
   // We consider that reducing spilling is always beneficial so we never
   // rollback rematerializations in such cases. It's also possible that
-  // rescheduling lowers occupancy over the one achived just through remats, in
-  // which case we do not want to rollback either.
-  if (!IncreaseOccupancy || AchievedOcc == TargetOcc)
+  // rescheduling lowers occupancy over the one achieved just through remats, in
+  // which case we do not want to rollback either (the rescheduling was already
+  // reverted in PreRARematStage::shouldRevertScheduling in such cases).
+  unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
+  if (!IncreaseOccupancy || MaxOcc == TargetOcc)
     return;
 
   REMAT_DEBUG(dbgs() << "Rollbacking all rematerializations\n");
@@ -2077,10 +2149,7 @@ void PreRARematStage::finalizeGCNSchedStage() {
 
     // Re-rematerialize MI at the end of its original region. Note that it may
     // not be rematerialized exactly in the same position as originally within
-    // the region, but it should not matter much. Since we are only
-    // rematerializing instructions that do not have any virtual reg uses, we
-    // do not need to call LiveRangeEdit::allUsesAvailableAt() and
-    // LiveRangeEdit::canRematerializeAt().
+    // the region, but it should not matter much.
     TII->reMaterialize(*MBB, InsertPos, Reg, SubReg, RematMI, *DAG.TRI);
     MachineInstr *NewMI = &*std::prev(InsertPos);
     NewMI->getOperand(0).setSubReg(SubReg);
 
@@ -485,6 +485,13 @@ class PreRARematStage : public GCNSchedStage {
   /// spilling.
   bool IncreaseOccupancy;
 
+  /// Returns whether \p RP has excess VGPR pressure w.r.t. \p MaxVGPRs. When
+  /// ArchVGPRs must be saved to remove it, their number is stored in \p
+  /// ExcessArchVGPRs; when AGPR pressure alone is excessive, \p AGPRLimited is
+  /// set to true. Neither output is written otherwise (callers initialize).
+  bool hasExcessVGPRs(const GCNRegPressure &RP, unsigned MaxVGPRs,
+                      unsigned &ExcessArchVGPRs, bool &AGPRLimited);
+
   /// Returns whether remat can reduce spilling or increase function occupancy
   /// by 1 through rematerialization. If it can do one, collects instructions in
   /// PreRARematStage::Rematerializations and sets the target occupancy in
 
@@ -466,7 +466,7 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
                             getReservedNumSGPRs(MF));
 }
 
-static unsigned getMaxNumPreloadedSGPRs() {
+unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
   using USI = GCNUserSGPRUsageInfo;
   // Max number of user SGPRs
   const unsigned MaxUserSGPRs =
@@ -496,10 +496,8 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
                             getReservedNumSGPRs(F));
 }
 
-unsigned
-GCNSubtarget::getBaseMaxNumVGPRs(const Function &F,
-                                 std::pair<unsigned, unsigned> NumVGPRBounds,
-                                 bool UnifiedRF) const {
+unsigned GCNSubtarget::getBaseMaxNumVGPRs(
+    const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
   const auto &[Min, Max] = NumVGPRBounds;
 
   // Check if maximum number of VGPRs was explicitly requested using
@@ -510,7 +508,7 @@ GCNSubtarget::getBaseMaxNumVGPRs(const Function &F,
   unsigned Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", Max);
   if (!Requested)
     return Max;
-  if (UnifiedRF)
+  if (hasGFX90AInsts())
     Requested *= 2;
 
   // Make sure requested value is inside the range of possible VGPR usage.
@@ -520,15 +518,7 @@ GCNSubtarget::getBaseMaxNumVGPRs(const Function &F,
 unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
   std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
   return getBaseMaxNumVGPRs(
-      F, {getMinNumVGPRs(Waves.second), getMaxNumVGPRs(Waves.first)},
-      hasGFX90AInsts());
-}
-
-unsigned GCNSubtarget::getMaxNumArchVGPRs(const Function &F) const {
-  std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
-  return getBaseMaxNumVGPRs(
-      F, {getMinNumArchVGPRs(Waves.second), getMaxNumArchVGPRs(Waves.first)},
-      false);
+      F, {getMinNumVGPRs(Waves.second), getMaxNumVGPRs(Waves.first)});
 }
 
 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
 
@@ -1487,6 +1487,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// \returns Reserved number of SGPRs for given function \p F.
   unsigned getReservedNumSGPRs(const Function &F) const;
 
+  /// \returns Maximum number of preloaded SGPRs for the subtarget.
+  unsigned getMaxNumPreloadedSGPRs() const;
+
   /// \returns max num SGPRs. This is the common utility
   /// function called by MachineFunction and Function
   /// variants of getMaxNumSGPRs.
@@ -1547,29 +1550,16 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
   }
 
-  /// \returns the minimum number of ArchVGPRs that will prevent achieving more
-  /// than the specified number of waves \p WavesPerEU.
-  unsigned getMinNumArchVGPRs(unsigned WavesPerEU) const {
-    return AMDGPU::IsaInfo::getMinNumArchVGPRs(this, WavesPerEU);
-  }
-
   /// \returns the maximum number of VGPRs that can be used and still achieved
   /// at least the specified number of waves \p WavesPerEU.
   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
   }
 
-  /// \returns the maximum number of ArchVGPRs that can be used and still
-  /// achieve at least the specified number of waves \p WavesPerEU.
-  unsigned getMaxNumArchVGPRs(unsigned WavesPerEU) const {
-    return AMDGPU::IsaInfo::getMaxNumArchVGPRs(this, WavesPerEU);
-  }
-
-  /// \returns max num VGPRs. This is the common utility function called by
-  /// MachineFunction and Function variants of getMaxNum[Arch]VGPRs.
+  /// \returns max num VGPRs. This is the common utility function
+  /// called by MachineFunction and Function variants of getMaxNumVGPRs.
   unsigned getBaseMaxNumVGPRs(const Function &F,
-                              std::pair<unsigned, unsigned> NumVGPRBounds,
-                              bool UnifiedRF) const;
+                              std::pair<unsigned, unsigned> NumVGPRBounds) const;
 
   /// \returns Maximum number of VGPRs that meets number of waves per execution
   /// unit requirement for function \p F, or number of VGPRs explicitly
@@ -1581,12 +1571,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// unit requirement.
   unsigned getMaxNumVGPRs(const Function &F) const;
 
-  /// Returns the maximum number of ArchVGPRs that meets number of waves per
-  /// execution unit requirement for function \p F, or number of ArchVGPRs
-  /// explicitly requested using "amdgpu-num-vgpr" attribute attached to
-  /// function \p F.
-  unsigned getMaxNumArchVGPRs(const Function &F) const;
-
   unsigned getMaxNumAGPRs(const Function &F) const {
     return getMaxNumVGPRs(F);
   }
 
@@ -1240,36 +1240,28 @@ unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
   return 5;
 }
 
-unsigned getBaseMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
-                            unsigned TotalNumVGPRs,
-                            unsigned NumAddressableVGPRs) {
+unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
   assert(WavesPerEU != 0);
+
   unsigned MaxWavesPerEU = getMaxWavesPerEU(STI);
   if (WavesPerEU >= MaxWavesPerEU)
     return 0;
-  unsigned MinWavesPerEU =
-      getNumWavesPerEUWithNumVGPRs(STI, NumAddressableVGPRs);
-  if (WavesPerEU < MinWavesPerEU)
-    return getMinNumVGPRs(STI, MinWavesPerEU);
 
+  unsigned TotNumVGPRs = getTotalNumVGPRs(STI);
+  unsigned AddrsableNumVGPRs = getAddressableNumVGPRs(STI);
   unsigned Granule = getVGPRAllocGranule(STI);
-  unsigned MaxNumVGPRs = alignDown(TotalNumVGPRs / WavesPerEU, Granule);
-  if (MaxNumVGPRs == alignDown(TotalNumVGPRs / MaxWavesPerEU, Granule))
+  unsigned MaxNumVGPRs = alignDown(TotNumVGPRs / WavesPerEU, Granule);
+
+  if (MaxNumVGPRs == alignDown(TotNumVGPRs / MaxWavesPerEU, Granule))
     return 0;
-  unsigned MaxNumVGPRsNext =
-      alignDown(TotalNumVGPRs / (WavesPerEU + 1), Granule);
-  unsigned MinNumVGPRs = 1 + std::min(MaxNumVGPRs - Granule, MaxNumVGPRsNext);
-  return std::min(MinNumVGPRs, NumAddressableVGPRs);
-}
 
-unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
-  return getBaseMinNumVGPRs(STI, WavesPerEU, getTotalNumVGPRs(STI),
-                            getAddressableNumVGPRs(STI));
-}
+  unsigned MinWavesPerEU = getNumWavesPerEUWithNumVGPRs(STI, AddrsableNumVGPRs);
+  if (WavesPerEU < MinWavesPerEU)
+    return getMinNumVGPRs(STI, MinWavesPerEU);
 
-unsigned getMinNumArchVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
-  unsigned TotNumArchVGPRs = getAddressableNumArchVGPRs(STI);
-  return getBaseMinNumVGPRs(STI, WavesPerEU, TotNumArchVGPRs, TotNumArchVGPRs);
+  unsigned MaxNumVGPRsNext = alignDown(TotNumVGPRs / (WavesPerEU + 1), Granule);
+  unsigned MinNumVGPRs = 1 + std::min(MaxNumVGPRs - Granule, MaxNumVGPRsNext);
+  return std::min(MinNumVGPRs, AddrsableNumVGPRs);
 }
 
 unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
@@ -1281,12 +1273,6 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
   return std::min(MaxNumVGPRs, AddressableNumVGPRs);
 }
 
-unsigned getMaxNumArchVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
-  assert(WavesPerEU != 0);
-  return alignDown(getAddressableNumArchVGPRs(STI) / WavesPerEU,
-                   getVGPRAllocGranule(STI));
-}
-
 unsigned getEncodedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs,
                                  std::optional<bool> EnableWavefrontSize32) {
   return getGranulatedNumRegisterBlocks(