-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU][Scheduler] Consistent occupancy calculation during rematerialization #149224
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-amdgpu Author: Lucas Ramirez (lucas-rami) Changes: The scheduler's rematerialization stage currently uses an occupancy calculation that differs from the "regular" one used elsewhere in the scheduler. This change makes the whole rematerialization stage use occupancy estimates influenced by the combined-VGPR-savings logic consistently; combined VGPR savings can be disabled through the new `-amdgpu-schedule-combine-vgpr-savings` option (enabled by default). Patch is 92.53 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149224.diff 11 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 7d6723a6108be..f3f6c5e845fc6 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -14,6 +14,7 @@
#include "GCNRegPressure.h"
#include "AMDGPU.h"
#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
using namespace llvm;
@@ -41,6 +42,18 @@ unsigned GCNRegPressure::getRegKind(const TargetRegisterClass *RC,
return STI->isSGPRClass(RC) ? SGPR : (STI->isAGPRClass(RC) ? AGPR : VGPR);
}
+unsigned GCNRegPressure::getOccupancy(const GCNSubtarget &ST,
+ unsigned DynamicVGPRBlockSize,
+ bool BalanceVGPRUsage) const {
+ const bool UnifiedRF = ST.hasGFX90AInsts();
+ unsigned NumVGPRs = (!UnifiedRF && BalanceVGPRUsage)
+ ? divideCeil(getArchVGPRNum() + getAGPRNum(), 2)
+ : getVGPRNum(UnifiedRF);
+
+ return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
+ ST.getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize));
+}
+
void GCNRegPressure::inc(unsigned Reg,
LaneBitmask PrevMask,
LaneBitmask NewMask,
@@ -366,31 +379,33 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP,
bool CombineVGPRSavings)
- : RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
+ : MF(MF), RP(RP) {
const Function &F = MF.getFunction();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- setRegLimits(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F), MF);
+ setTarget(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F), CombineVGPRSavings);
}
GCNRPTarget::GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs,
const MachineFunction &MF, const GCNRegPressure &RP,
bool CombineVGPRSavings)
- : RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
- setRegLimits(NumSGPRs, NumVGPRs, MF);
+ : MF(MF), RP(RP) {
+ setTarget(NumSGPRs, NumVGPRs, CombineVGPRSavings);
}
GCNRPTarget::GCNRPTarget(unsigned Occupancy, const MachineFunction &MF,
const GCNRegPressure &RP, bool CombineVGPRSavings)
- : RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
+ : MF(MF), RP(RP) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
- setRegLimits(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false),
- ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize), MF);
+ setTarget(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false),
+ ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize),
+ CombineVGPRSavings);
}
-void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs,
- const MachineFunction &MF) {
+void GCNRPTarget::setTarget(unsigned NumSGPRs, unsigned NumVGPRs,
+ bool CombineVGPRSavings) {
+ this->CombineVGPRSavings = CombineVGPRSavings;
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
@@ -402,8 +417,8 @@ void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs,
: 0;
}
-bool GCNRPTarget::isSaveBeneficial(Register Reg,
- const MachineRegisterInfo &MRI) const {
+bool GCNRPTarget::isSaveBeneficial(Register Reg) const {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 3749b6d1efc63..57295a9d15724 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -18,6 +18,8 @@
#define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include <algorithm>
@@ -69,12 +71,12 @@ struct GCNRegPressure {
}
unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; }
- unsigned getOccupancy(const GCNSubtarget &ST,
- unsigned DynamicVGPRBlockSize) const {
- return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
- ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()),
- DynamicVGPRBlockSize));
- }
+ /// Determines the occupancy achievable with the current RP, when \p
+ /// BalanceVGPRUsage is true on subtargets with non-unified RFs, the
+ /// occupancy w.r.t. the number of VGPRs is computed as if we will later be
+ /// able to evenly balance out VGPR usage among ArchVGPR and AGPR banks.
+ unsigned getOccupancy(const GCNSubtarget &ST, unsigned DynamicVGPRBlockSize,
+ bool BalanceVGPRUsage = false) const;
void inc(unsigned Reg,
LaneBitmask PrevMask,
@@ -187,22 +189,34 @@ class GCNRPTarget {
GCNRPTarget(unsigned Occupancy, const MachineFunction &MF,
const GCNRegPressure &RP, bool CombineVGPRSavings = false);
+ /// Changes the target (same semantics as constructor).
+ void setTarget(unsigned NumSGPRs, unsigned NumVGPRs,
+ bool CombineVGPRSavings = false);
+
const GCNRegPressure &getCurrentRP() const { return RP; }
void setRP(const GCNRegPressure &NewRP) { RP = NewRP; }
/// Determines whether saving virtual register \p Reg will be beneficial
/// towards achieving the RP target.
- bool isSaveBeneficial(Register Reg, const MachineRegisterInfo &MRI) const;
+ bool isSaveBeneficial(Register Reg) const;
/// Saves virtual register \p Reg with lanemask \p Mask.
- void saveReg(Register Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI) {
- RP.inc(Reg, Mask, LaneBitmask::getNone(), MRI);
+ void saveReg(Register Reg, LaneBitmask Mask) {
+ RP.inc(Reg, Mask, LaneBitmask::getNone(), MF.getRegInfo());
}
/// Whether the current RP is at or below the defined pressure target.
bool satisfied() const;
+ /// Computes achievable occupancy with the currently tracked register pressure.
+ unsigned getOccupancy() const {
+ return RP.getOccupancy(
+ MF.getSubtarget<GCNSubtarget>(),
+ MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(),
+ /*BalanceVGPRUsage=*/CombineVGPRSavings);
+ }
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
friend raw_ostream &operator<<(raw_ostream &OS, const GCNRPTarget &Target) {
OS << "Actual/Target: " << Target.RP.getSGPRNum() << '/' << Target.MaxSGPRs
@@ -222,6 +236,8 @@ class GCNRPTarget {
#endif
private:
+ const MachineFunction &MF;
+
/// Current register pressure.
GCNRegPressure RP;
@@ -234,7 +250,7 @@ class GCNRPTarget {
unsigned MaxUnifiedVGPRs;
/// Whether we consider that the register allocator will be able to swap
/// between ArchVGPRs and AGPRs by copying them to a super register class.
- /// Concretely, this allows savings in one of the VGPR banks to help toward
+ /// Concretely, this allows free registers in one VGPR bank to help toward
/// savings in the other VGPR bank.
bool CombineVGPRSavings;
@@ -252,9 +268,6 @@ class GCNRPTarget {
return NumVGPRs > MaxVGPRs || !satisfiesUnifiedTarget() ||
(CombineVGPRSavings && !satisifiesVGPRBanksTarget());
}
-
- void setRegLimits(unsigned MaxSGPRs, unsigned MaxVGPRs,
- const MachineFunction &MF);
};
///////////////////////////////////////////////////////////////////////////////
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index fce8f36d45969..13fd7ac19a607 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -68,6 +68,13 @@ static cl::opt<bool> GCNTrackers(
cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
cl::init(false));
+static cl::opt<bool> RematCombineVGPRSavings(
+ "amdgpu-schedule-combine-vgpr-savings", cl::Hidden,
+ cl::desc("Combine ArchVGPR/AGPR savings when computing occupancy estimates "
+ "during rematerialization stage. Set to false for generally more "
+ "aggressive rematerialization."),
+ cl::init(true));
+
const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
@@ -1116,10 +1123,14 @@ bool PreRARematStage::initGCNSchedStage() {
rematerialize();
if (GCNTrackers)
DAG.RegionLiveOuts.buildLiveRegMap();
- REMAT_DEBUG(
- dbgs() << "Retrying function scheduling with new min. occupancy of "
- << AchievedOcc << " from rematerializing (original was "
- << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n");
+ REMAT_DEBUG({
+ dbgs() << "Retrying function scheduling with new min. occupancy of "
+ << AchievedOcc << " from rematerializing";
+ if (TargetOcc)
+ dbgs() << ", target was " << *TargetOcc;
+ dbgs() << '\n';
+ });
+
if (AchievedOcc > DAG.MinOccupancy) {
DAG.MinOccupancy = AchievedOcc;
SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
@@ -1549,9 +1560,19 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
}
bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
- return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
- mayCauseSpilling(WavesAfter) ||
- (IncreaseOccupancy && WavesAfter < TargetOcc);
+ if (mayCauseSpilling(WavesAfter))
+ return true;
+ if (!TargetOcc) {
+ // Never revert when trying to reduce spilling and RP did not increase in
+ // the region.
+ return false;
+ }
+
+ // When trying to increase occupancy, only revert if we failed to achieve the
+ // target occupancy. Compute occupancy using the RP target instead of the
+ // default RP logic to be consistent with the rest of the stage.
+ return PressureAfter.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize(),
+ RematCombineVGPRSavings) < *TargetOcc;
}
bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
@@ -1700,78 +1721,79 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
}
bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
- REMAT_DEBUG({
- dbgs() << "Collecting rematerializable instructions in ";
- MF.getFunction().printAsOperand(dbgs(), false);
- dbgs() << '\n';
- });
+ const Function &F = MF.getFunction();
// Maps optimizable regions (i.e., regions at minimum and register-limited
// occupancy, or regions with spilling) to the target RP we would like to
// reach.
DenseMap<unsigned, GCNRPTarget> OptRegions;
- const Function &F = MF.getFunction();
- unsigned DynamicVGPRBlockSize =
- MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
- std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
+ // Define "spilling targets" for spilling regions. This takes into account any
+ // SGPR/VGPR usage restriction requested through the "amdgpu-num-sgpr" /
+ // "amdgpu-num-vgpr" attributes beyond the limits imposed by the minimum
+ // number of waves per EU. Usage above those restrictions is considered like
+ // spill.
const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F);
const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F);
- const unsigned MaxSGPRsIncOcc =
- ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
- const unsigned MaxVGPRsIncOcc =
- ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize);
- IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
-
- // Collect optimizable regions. If there is spilling in any region we will
- // just try to reduce spilling. Otherwise we will try to increase occupancy by
- // one in the whole function.
+ unsigned MinOcc = MFI.getMaxWavesPerEU();
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- GCNRegPressure &RP = DAG.Pressure[I];
- // We allow ArchVGPR or AGPR savings to count as savings of the other kind
- // of VGPR only when trying to eliminate spilling. We cannot do this when
- // trying to increase occupancy since VGPR class swaps only occur later in
- // the register allocator i.e., the scheduler will not be able to reason
- // about these savings and will not report an increase in the achievable
- // occupancy, triggering rollbacks.
+ const GCNRegPressure &RP = DAG.Pressure[I];
GCNRPTarget Target(MaxSGPRsNoSpill, MaxVGPRsNoSpill, MF, RP,
- /*CombineVGPRSavings=*/true);
- if (!Target.satisfied() && IncreaseOccupancy) {
- // There is spilling in the region and we were so far trying to increase
- // occupancy. Strop trying that and focus on reducing spilling.
- IncreaseOccupancy = false;
- OptRegions.clear();
- } else if (IncreaseOccupancy) {
- // There is no spilling in the region, try to increase occupancy.
- Target = GCNRPTarget(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP,
- /*CombineVGPRSavings=*/false);
- }
+ /*CombineVGPRSavings=*/RematCombineVGPRSavings);
+ MinOcc = std::min(MinOcc, Target.getOccupancy());
if (!Target.satisfied())
OptRegions.insert({I, Target});
}
- if (OptRegions.empty())
- return false;
-#ifndef NDEBUG
- if (IncreaseOccupancy) {
- REMAT_DEBUG(dbgs() << "Occupancy minimal (" << DAG.MinOccupancy
- << ") in regions:\n");
+ if (!OptRegions.empty() || MinOcc >= MFI.getMaxWavesPerEU()) {
+ // There is spilling in at least one region, or we are already at maximum
+ // occupancy.
+ TargetOcc = std::nullopt;
} else {
- REMAT_DEBUG(dbgs() << "Spilling w.r.t. minimum target occupancy ("
- << WavesPerEU.first << ") in regions:\n");
- }
- for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end())
- REMAT_DEBUG(dbgs() << " [" << I << "] " << OptIt->getSecond() << '\n');
+ // There is no spilling and room to improve occupancy; set up "increased
+ // occupancy" targets for all regions. We further restrict the SGPR/VGPR
+ // limits for increasing occupancy by the "spilling limits" since the latter
+ // may end up smaller due to "amdgpu-num-sgpr" / "amdgpu-num-vgpr"
+ // attributes.
+ TargetOcc = MinOcc + 1;
+
+ unsigned DynamicVGPRBlockSize =
+ MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+ const unsigned MaxSGPRsIncOcc =
+ std::min(MaxSGPRsNoSpill, ST.getMaxNumSGPRs(*TargetOcc, false));
+ const unsigned MaxVGPRsIncOcc = std::min(
+ MaxVGPRsNoSpill, ST.getMaxNumVGPRs(*TargetOcc, DynamicVGPRBlockSize));
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ const GCNRegPressure &RP = DAG.Pressure[I];
+ GCNRPTarget Target(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP,
+ /*CombineVGPRSavings=*/RematCombineVGPRSavings);
+ if (!Target.satisfied())
+ OptRegions.insert({I, Target});
+ }
+ assert(!OptRegions.empty() && "there should be at least one target region");
}
-#endif
- // When we are reducing spilling, the target is the minimum target number of
- // waves/EU determined by the subtarget. In cases where either one of
- // "amdgpu-num-sgpr" or "amdgpu-num-vgpr" are set on the function, the current
- // minimum region occupancy may be higher than the latter.
- TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1
- : std::max(DAG.MinOccupancy, WavesPerEU.first);
+ REMAT_DEBUG({
+ dbgs() << "Analyzing ";
+ MF.getFunction().printAsOperand(dbgs(), false);
+ dbgs() << ": ";
+ if (OptRegions.empty()) {
+ LLVM_DEBUG(dbgs() << "no objective to achieve, occupancy is maximal at "
+ << MFI.getMaxWavesPerEU() << "\n");
+ } else if (!TargetOcc) {
+ LLVM_DEBUG(dbgs() << "reduce spilling (minimum target occupancy is "
+ << MFI.getMinWavesPerEU() << ")\n");
+ } else {
+ LLVM_DEBUG(dbgs() << "increase occupancy from " << MinOcc << " to "
+ << TargetOcc << '\n');
+ }
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end())
+ REMAT_DEBUG(dbgs() << " [" << I << "] " << OptIt->getSecond() << '\n');
+ }
+ });
+ if (OptRegions.empty())
+ return false;
// Accounts for a reduction in RP in an optimizable region. Returns whether we
// estimate that we have identified enough rematerialization opportunities to
@@ -1780,10 +1802,10 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask,
bool &Progress) -> bool {
GCNRPTarget &Target = OptIt->getSecond();
- if (!Target.isSaveBeneficial(Reg, DAG.MRI))
+ if (!Target.isSaveBeneficial(Reg))
return false;
Progress = true;
- Target.saveReg(Reg, Mask, DAG.MRI);
+ Target.saveReg(Reg, Mask);
if (Target.satisfied())
OptRegions.erase(OptIt->getFirst());
return OptRegions.empty();
@@ -1889,7 +1911,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
}
}
- if (IncreaseOccupancy) {
+ if (TargetOcc) {
// We were trying to increase occupancy but failed, abort the stage.
REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
Rematerializations.clear();
@@ -1994,7 +2016,7 @@ void PreRARematStage::rematerialize() {
// All regions impacted by at least one rematerialization must be rescheduled.
// Maximum pressure must also be recomputed for all regions where it changed
// non-predictably and checked against the target occupancy.
- AchievedOcc = TargetOcc;
+ AchievedOcc = MFI.getMaxWavesPerEU();
for (auto &[I, OriginalRP] : ImpactedRegions) {
bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
RescheduleRegions[I] = !IsEmptyRegion;
@@ -2018,9 +2040,10 @@ void PreRARematStage::rematerialize() {
}
}
DAG.Pressure[I] = RP;
- AchievedOcc = std::min(
- AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>()
- ->getDynamicVGPRBlockSize()));
+ unsigned NewOcc = RP.getOccupancy(
+ ST, MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(),
+ RematCombineVGPRSavings);
+ AchievedOcc = std::min(AchievedOcc, NewOcc);
}
REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
}
@@ -2050,7 +2073,7 @@ void PreRARematStage::finalizeGCNSchedStage() {
// which case we do not want to rollback either (the rescheduling was already
// reverted in PreRARematStage::shouldRevertScheduling in such cases).
unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
- if (!IncreaseOccupancy || MaxOcc >= TargetOcc)
+ if (!TargetOcc || MaxOcc >= *TargetOcc)
return;
REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 94cd795bbc8f6..09ce8751fa95a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -473,15 +473,12 @@ class PreRARematStage : public GCNSchedStage {
/// After successful stage initialization, indicates which regions should be
/// rescheduled.
BitVector RescheduleRegions;
- /// Target occupancy the stage estimates is reachable through
- /// rematerialization. Greater than or equal to the pre-stage min occupancy.
- unsigned TargetOcc;
+ /// The target occupancy the stage is trying to achieve. Empty when the
+ /// objective is spilling reduction.
+ std::optional<unsigned> TargetO...
[truncated]
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
0908c8a to
0f1e38d
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it is only beneficial to reduce the OtherRC if the OtherRC exceeds the addressable limit for that RC and the combined pressure is above the addressable limit for unified RF. In this case we cannot use copies / LiveRange splitting alone to allocate the registers, we must spill.
Since we know the current RC is less than the addressable limit, then it may be enough to just check the unified RP against the addressable limit for unified RF.
By reducing cross RC pressure any time we're over the MaxUnifiedVGPRs, we are telling the rematerializer to issue cross RC copies to increase occupancy.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
By reducing cross RC pressure any time we're over the MaxUnifiedVGPRs, we are telling the rematerializer to issue cross RC copies to increase occupancy.
Apologies, I am not sure I understand.
I guess we agree on the spilling case.
For the occupancy increase case:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah I think the way you've implemented it is fine actually.
I was looking at a case where we use the waves-per-eu attribute. Unless amdgpu-agpr-alloc is also specified, we will just split the unified RF in half during RA. In that case, with the current implementation, we would potentially be inserting cross RC copies to increase occupancy. I'm not sure how concerned we should be with this case, since for targets with a unified RF, we should be using pure VGPRs unless in the occupancy-1 case.
However, in the default case (no attributes), RA can allocate up to the maxNumVGPRs for each RC, which should encourage a more optimal split while honoring the unified limit.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for doing the investigation and for the insights, I'll keep them in mind for my future remat work.
jrbyrnes
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: are any of these comments still relevant?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes these are still relevant
It's actually possible to hit the assert on valid IR if both "amdgpu-num-sgpr" and "amdgpu-num-vgpr" are defined and have very low values. It's also unnecessary to take the minimum between the spilling and increase-occupancy targets, since the latter will necessarily be higher and therefore automatically satisfied even when these attributes are defined.
0ea15df to
fa1524d
Compare
…lization (llvm#149224) The `RPTarget`'s way of determining whether VGPRs are beneficial to save and whether the target has been reached w.r.t. VGPR usage currently assumes, if `CombinedVGPRSavings` is true, that free slots in one VGPR RC can always be used for the other. Implicitly, this makes the rematerialization stage (only current user of `RPTarget`) follow a different occupancy calculation than the "regular one" that the scheduler uses, one that assumes that ArchVGPR/AGPR usage can be balanced perfectly and at no cost, which is untrue in general. This ultimately yields suboptimal rematerialization decisions that require cross-VGPR-RC copies unnecessarily. This fixes that, making the `RPTarget`'s internal model of occupancy consistent with the regular one. The `CombinedVGPRSavings` flag is removed, and a form of cross-VGPR-RC saving implemented only for unified RFs, which is where it makes the most sense. Only when the amount of free VGPRs in a given VGPR RC (ArchVGPR or AGPR) is lower than the excess VGPR usage in the other VGPR RC does the `RPTarget` consider that a pressure reduction in the former will be beneficial to the latter.
…lization (llvm#149224) The `RPTarget`'s way of determining whether VGPRs are beneficial to save and whether the target has been reached w.r.t. VGPR usage currently assumes, if `CombinedVGPRSavings` is true, that free slots in one VGPR RC can always be used for the other. Implicitly, this makes the rematerialization stage (only current user of `RPTarget`) follow a different occupancy calculation than the "regular one" that the scheduler uses, one that assumes that ArchVGPR/AGPR usage can be balanced perfectly and at no cost, which is untrue in general. This ultimately yields suboptimal rematerialization decisions that require cross-VGPR-RC copies unnecessarily. This fixes that, making the `RPTarget`'s internal model of occupancy consistent with the regular one. The `CombinedVGPRSavings` flag is removed, and a form of cross-VGPR-RC saving implemented only for unified RFs, which is where it makes the most sense. Only when the amount of free VGPRs in a given VGPR RC (ArchVGPR or AGPR) is lower than the excess VGPR usage in the other VGPR RC does the `RPTarget` consider that a pressure reduction in the former will be beneficial to the latter.
The `RPTarget`'s way of determining whether VGPRs are beneficial to save, and whether the target has been reached w.r.t. VGPR usage, currently assumes, if `CombinedVGPRSavings` is true, that free slots in one VGPR RC can always be used for the other. Implicitly, this makes the rematerialization stage (the only current user of `RPTarget`) follow a different occupancy calculation than the "regular one" that the scheduler uses — one that assumes that ArchVGPR/AGPR usage can be balanced perfectly and at no cost, which is untrue in general. This ultimately yields suboptimal rematerialization decisions that require cross-VGPR-RC copies unnecessarily.

This fixes that, making the `RPTarget`'s internal model of occupancy consistent with the regular one. The `CombinedVGPRSavings` flag is removed, and a form of cross-VGPR-RC saving is implemented only for unified RFs, which is where it makes the most sense. Only when the amount of free VGPRs in a given VGPR RC (ArchVGPR or AGPR) is lower than the excess VGPR usage in the other VGPR RC does the `RPTarget` consider that a pressure reduction in the former will be beneficial to the latter.