Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 29 additions & 27 deletions llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -368,46 +368,45 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
////////////////////////////////////////////////////////////////////////////////
// GCNRPTarget

GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP,
bool CombineVGPRSavings)
: RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP)
: GCNRPTarget(RP, MF) {
const Function &F = MF.getFunction();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
setRegLimits(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F), MF);
setTarget(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F));
}

GCNRPTarget::GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs,
const MachineFunction &MF, const GCNRegPressure &RP,
bool CombineVGPRSavings)
: RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
setRegLimits(NumSGPRs, NumVGPRs, MF);
const MachineFunction &MF, const GCNRegPressure &RP)
: GCNRPTarget(RP, MF) {
setTarget(NumSGPRs, NumVGPRs);
}

GCNRPTarget::GCNRPTarget(unsigned Occupancy, const MachineFunction &MF,
const GCNRegPressure &RP, bool CombineVGPRSavings)
: RP(RP), CombineVGPRSavings(CombineVGPRSavings) {
const GCNRegPressure &RP)
: GCNRPTarget(RP, MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
setRegLimits(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false),
ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize), MF);
setTarget(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false),
ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize));
}

void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs,
const MachineFunction &MF) {
void GCNRPTarget::setTarget(unsigned NumSGPRs, unsigned NumVGPRs) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
MaxSGPRs = std::min(ST.getAddressableNumSGPRs(), NumSGPRs);
MaxVGPRs = std::min(ST.getAddressableNumArchVGPRs(), NumVGPRs);
MaxUnifiedVGPRs =
ST.hasGFX90AInsts()
? std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs)
: 0;
if (UnifiedRF) {
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
MaxUnifiedVGPRs =
std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs);
} else {
MaxUnifiedVGPRs = 0;
}
}

bool GCNRPTarget::isSaveBeneficial(Register Reg,
const MachineRegisterInfo &MRI) const {
bool GCNRPTarget::isSaveBeneficial(Register Reg) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
Expand All @@ -416,16 +415,19 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg,
return RP.getSGPRNum() > MaxSGPRs;
unsigned NumVGPRs =
SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum();
return isVGPRBankSaveBeneficial(NumVGPRs);
// The addressable limit must always be respected.
if (NumVGPRs > MaxVGPRs)
return true;
// For unified RFs, combined VGPR usage limit must be respected as well.
return UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is only beneficial to reduce the OtherRC if the OtherRC exceeds the addressable limit for that RC and the combined pressure is above the addressable limit for the unified RF. In this case we cannot use copies / LiveRange splitting alone to allocate the registers; we must spill.

Since we know the current RC is less than the addressable limit, it may be enough to just check the unified RP against the addressable limit for the unified RF.

By reducing cross RC pressure any time we're over the MaxUnifiedVGPRs, we are telling the rematerializer to issue cross RC copies to increase occupancy.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By reducing cross RC pressure any time we're over the MaxUnifiedVGPRs, we are telling the rematerializer to issue cross RC copies to increase occupancy.

Apologies, I am not sure I understand.

I guess we agree on the spilling case ($MaxVGPRs=256 \wedge MaxUnifiedVGPRs=512$) since in that case $NumVGPRsInRC \leq MaxVGPRs \wedge RP.getVGPRNum(true) > MaxUnifiedVGPRs \Longrightarrow NumVGPRsInOtherRC > MaxVGPRs$ (modulo the VGPR allocation granule in the unified computation), i.e., we only do cross-RC saves if there are too many excess VGPRs in the other RC to fit through copies in the current RC.

For the occupancy increase case ($0 < MaxVGPRs = MaxUnifiedVGPRs \leq 256$) we always have $NumVGPRsInRC < 256$ and $NumVGPRsInOtherRC < 256$, otherwise the stage would be trying to reduce spilling. If $NumVGPRsInRC \leq MaxVGPRs \wedge RP.getVGPRNum(true) > MaxUnifiedVGPRs$, isn't any VGPR/AGPR save beneficial? Is there a chance we increase the number of cross RC copies by always saving there?

Copy link
Contributor

@jrbyrnes jrbyrnes Aug 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I think the way you've implemented it is fine actually.

I was looking at a case where we use the waves-per-eu attribute. Unless amdgpu-agpr-alloc is also specified, we will just split the unified RF in half during RA. In that case, with the current implementation, we would potentially be inserting cross RC copies to increase occupancy. I'm not sure how concerned we should be with this case, since for targets with a UnifiedRF, we should be using pure VGPR unless in the occupancy-1 case.

However, in the default case (no attributes), RA can allocate up to the maxNumVGPRs for each RC, which should encourage a more optimal split while honoring the unified limit.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for doing the investigation and for the insights, I'll keep them in mind for my future remat work.

}

bool GCNRPTarget::satisfied() const {
if (RP.getSGPRNum() > MaxSGPRs)
if (RP.getSGPRNum() > MaxSGPRs || RP.getVGPRNum(false) > MaxVGPRs)
return false;
if (RP.getVGPRNum(false) > MaxVGPRs &&
(!CombineVGPRSavings || !satisifiesVGPRBanksTarget()))
if (UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs)
return false;
return satisfiesUnifiedTarget();
return true;
}

///////////////////////////////////////////////////////////////////////////////
Expand Down
43 changes: 13 additions & 30 deletions llvm/lib/Target/AMDGPU/GCNRegPressure.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,28 +186,30 @@ class GCNRPTarget {
/// Sets up the target such that the register pressure starting at \p RP does
/// not show register spilling on function \p MF (w.r.t. the function's
/// mininum target occupancy).
GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP,
bool CombineVGPRSavings = false);
GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP);

/// Sets up the target such that the register pressure starting at \p RP does
/// not use more than \p NumSGPRs SGPRs and \p NumVGPRs VGPRs on function \p
/// MF.
GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs, const MachineFunction &MF,
const GCNRegPressure &RP, bool CombineVGPRSavings = false);
const GCNRegPressure &RP);

/// Sets up the target such that the register pressure starting at \p RP does
/// not prevent achieving an occupancy of at least \p Occupancy on function
/// \p MF.
GCNRPTarget(unsigned Occupancy, const MachineFunction &MF,
const GCNRegPressure &RP, bool CombineVGPRSavings = false);
const GCNRegPressure &RP);

/// Changes the target (same semantics as constructor).
void setTarget(unsigned NumSGPRs, unsigned NumVGPRs);

const GCNRegPressure &getCurrentRP() const { return RP; }

void setRP(const GCNRegPressure &NewRP) { RP = NewRP; }

/// Determines whether saving virtual register \p Reg will be beneficial
/// towards achieving the RP target.
bool isSaveBeneficial(Register Reg, const MachineRegisterInfo &MRI) const;
bool isSaveBeneficial(Register Reg) const;

/// Saves virtual register \p Reg with lanemask \p Mask.
void saveReg(Register Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI) {
Expand All @@ -227,15 +229,15 @@ class GCNRPTarget {
if (Target.MaxUnifiedVGPRs) {
OS << ", " << Target.RP.getVGPRNum(true) << '/' << Target.MaxUnifiedVGPRs
<< " VGPRs (unified)";
} else if (Target.CombineVGPRSavings) {
OS << ", " << Target.RP.getArchVGPRNum() + Target.RP.getAGPRNum() << '/'
<< 2 * Target.MaxVGPRs << " VGPRs (combined target)";
}
return OS;
}
#endif

private:
const MachineFunction &MF;
const bool UnifiedRF;

/// Current register pressure.
GCNRegPressure RP;

Expand All @@ -246,29 +248,10 @@ class GCNRPTarget {
/// Target number of overall VGPRs for subtargets with unified RFs. Always 0
/// for subtargets with non-unified RFs.
unsigned MaxUnifiedVGPRs;
/// Whether we consider that the register allocator will be able to swap
/// between ArchVGPRs and AGPRs by copying them to a super register class.
/// Concretely, this allows savings in one of the VGPR banks to help toward
/// savings in the other VGPR bank.
bool CombineVGPRSavings;

inline bool satisifiesVGPRBanksTarget() const {
assert(CombineVGPRSavings && "only makes sense with combined savings");
return RP.getArchVGPRNum() + RP.getAGPRNum() <= 2 * MaxVGPRs;
}

/// Always satisified when the subtarget doesn't have a unified RF.
inline bool satisfiesUnifiedTarget() const {
return !MaxUnifiedVGPRs || RP.getVGPRNum(true) <= MaxUnifiedVGPRs;
}

inline bool isVGPRBankSaveBeneficial(unsigned NumVGPRs) const {
return NumVGPRs > MaxVGPRs || !satisfiesUnifiedTarget() ||
(CombineVGPRSavings && !satisifiesVGPRBanksTarget());
}

void setRegLimits(unsigned MaxSGPRs, unsigned MaxVGPRs,
const MachineFunction &MF);
GCNRPTarget(const GCNRegPressure &RP, const MachineFunction &MF)
: MF(MF), UnifiedRF(MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()),
RP(RP) {}
};

///////////////////////////////////////////////////////////////////////////////
Expand Down
145 changes: 68 additions & 77 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1086,7 +1086,8 @@ bool ClusteredLowOccStage::initGCNSchedStage() {
}

/// Allows to easily filter for this stage's debug output.
#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
#define REMAT_PREFIX "[PreRARemat] "
#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)

bool PreRARematStage::initGCNSchedStage() {
// FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
Expand Down Expand Up @@ -1115,10 +1116,15 @@ bool PreRARematStage::initGCNSchedStage() {
rematerialize();
if (GCNTrackers)
DAG.RegionLiveOuts.buildLiveRegMap();
REMAT_DEBUG(
dbgs() << "Retrying function scheduling with new min. occupancy of "
<< AchievedOcc << " from rematerializing (original was "
<< DAG.MinOccupancy << ", target was " << TargetOcc << ")\n");
REMAT_DEBUG({
dbgs() << "Retrying function scheduling with new min. occupancy of "
<< AchievedOcc << " from rematerializing (original was "
<< DAG.MinOccupancy;
if (TargetOcc)
dbgs() << ", target was " << *TargetOcc;
dbgs() << ")\n";
});

if (AchievedOcc > DAG.MinOccupancy) {
DAG.MinOccupancy = AchievedOcc;
SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
Expand Down Expand Up @@ -1540,8 +1546,7 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {

bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
mayCauseSpilling(WavesAfter) ||
(IncreaseOccupancy && WavesAfter < TargetOcc);
mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc);
}

bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
Expand Down Expand Up @@ -1687,78 +1692,63 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
}

bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
REMAT_DEBUG({
dbgs() << "Collecting rematerializable instructions in ";
MF.getFunction().printAsOperand(dbgs(), false);
dbgs() << '\n';
});
const Function &F = MF.getFunction();

// Maps optimizable regions (i.e., regions at minimum and register-limited
// occupancy, or regions with spilling) to the target RP we would like to
// reach.
DenseMap<unsigned, GCNRPTarget> OptRegions;
const Function &F = MF.getFunction();
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();

std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F);
const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F);
const unsigned MaxSGPRsIncOcc =
ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
const unsigned MaxVGPRsIncOcc =
ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize);
IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;

// Collect optimizable regions. If there is spilling in any region we will
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: are any of these comments still relevant?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes these are still relevant

// just try to reduce spilling. Otherwise we will try to increase occupancy by
// one in the whole function.
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
GCNRegPressure &RP = DAG.Pressure[I];
// We allow ArchVGPR or AGPR savings to count as savings of the other kind
// of VGPR only when trying to eliminate spilling. We cannot do this when
// trying to increase occupancy since VGPR class swaps only occur later in
// the register allocator i.e., the scheduler will not be able to reason
// about these savings and will not report an increase in the achievable
// occupancy, triggering rollbacks.
GCNRPTarget Target(MaxSGPRsNoSpill, MaxVGPRsNoSpill, MF, RP,
/*CombineVGPRSavings=*/true);
if (!Target.satisfied() && IncreaseOccupancy) {
// There is spilling in the region and we were so far trying to increase
// occupancy. Strop trying that and focus on reducing spilling.
IncreaseOccupancy = false;
OptRegions.clear();
} else if (IncreaseOccupancy) {
// There is no spilling in the region, try to increase occupancy.
Target = GCNRPTarget(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP,
/*CombineVGPRSavings=*/false);
unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
auto ResetTargetRegions = [&]() {
OptRegions.clear();
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
const GCNRegPressure &RP = DAG.Pressure[I];
GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP);
if (!Target.satisfied())
OptRegions.insert({I, Target});
}
if (!Target.satisfied())
OptRegions.insert({I, Target});
}
if (OptRegions.empty())
return false;
};

#ifndef NDEBUG
if (IncreaseOccupancy) {
REMAT_DEBUG(dbgs() << "Occupancy minimal (" << DAG.MinOccupancy
<< ") in regions:\n");
ResetTargetRegions();
if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
// In addition to register usage being above addressable limits, occupancy
// below the minimum is considered like "spilling" as well.
TargetOcc = std::nullopt;
} else {
REMAT_DEBUG(dbgs() << "Spilling w.r.t. minimum target occupancy ("
<< WavesPerEU.first << ") in regions:\n");
}
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end())
REMAT_DEBUG(dbgs() << " [" << I << "] " << OptIt->getSecond() << '\n');
// There is no spilling and room to improve occupancy; set up "increased
// occupancy targets" for all regions.
TargetOcc = DAG.MinOccupancy + 1;
unsigned VGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
ResetTargetRegions();
}
#endif

// When we are reducing spilling, the target is the minimum target number of
// waves/EU determined by the subtarget. In cases where either one of
// "amdgpu-num-sgpr" or "amdgpu-num-vgpr" are set on the function, the current
// minimum region occupancy may be higher than the latter.
TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1
: std::max(DAG.MinOccupancy, WavesPerEU.first);
REMAT_DEBUG({
dbgs() << "Analyzing ";
MF.getFunction().printAsOperand(dbgs(), false);
dbgs() << ": ";
if (OptRegions.empty()) {
dbgs() << "no objective to achieve, occupancy is maximal at "
<< MFI.getMaxWavesPerEU();
} else if (!TargetOcc) {
dbgs() << "reduce spilling (minimum target occupancy is "
<< MFI.getMinWavesPerEU() << ')';
} else {
dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to "
<< TargetOcc;
}
dbgs() << '\n';
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) {
dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond()
<< '\n';
}
}
});
if (OptRegions.empty())
return false;

// Accounts for a reduction in RP in an optimizable region. Returns whether we
// estimate that we have identified enough rematerialization opportunities to
Expand All @@ -1767,7 +1757,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask,
bool &Progress) -> bool {
GCNRPTarget &Target = OptIt->getSecond();
if (!Target.isSaveBeneficial(Reg, DAG.MRI))
if (!Target.isSaveBeneficial(Reg))
return false;
Progress = true;
Target.saveReg(Reg, Mask, DAG.MRI);
Expand Down Expand Up @@ -1876,7 +1866,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
}
}

if (IncreaseOccupancy) {
if (TargetOcc) {
// We were trying to increase occupancy but failed, abort the stage.
REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
Rematerializations.clear();
Expand Down Expand Up @@ -1979,7 +1969,9 @@ void PreRARematStage::rematerialize() {
// All regions impacted by at least one rematerialization must be rescheduled.
// Maximum pressure must also be recomputed for all regions where it changed
// non-predictably and checked against the target occupancy.
AchievedOcc = TargetOcc;
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
AchievedOcc = MFI.getMaxWavesPerEU();
for (auto &[I, OriginalRP] : ImpactedRegions) {
bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
RescheduleRegions[I] = !IsEmptyRegion;
Expand All @@ -2003,9 +1995,8 @@ void PreRARematStage::rematerialize() {
}
}
DAG.Pressure[I] = RP;
AchievedOcc = std::min(
AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>()
->getDynamicVGPRBlockSize()));
AchievedOcc =
std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
}
REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
}
Expand Down Expand Up @@ -2035,7 +2026,7 @@ void PreRARematStage::finalizeGCNSchedStage() {
// which case we do not want to rollback either (the rescheduling was already
// reverted in PreRARematStage::shouldRevertScheduling in such cases).
unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
if (!IncreaseOccupancy || MaxOcc >= TargetOcc)
if (!TargetOcc || MaxOcc >= *TargetOcc)
return;

REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
Expand Down
Loading