Skip to content

Commit 6e0e80e

Browse files
committed
Address feedback
- Properly account for AGPRs in excess RP calculations. - Account for SGPR spilling when deciding whether to try to increase occupancy. - Don't rollback when rescheduling managed to improve occuoancy after remats.
1 parent daf4723 commit 6e0e80e

22 files changed

+944
-385
lines changed

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 113 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1675,6 +1675,41 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
16751675
return true;
16761676
}
16771677

1678+
bool PreRARematStage::hasExcessVGPRs(const GCNRegPressure &RP,
1679+
unsigned MaxVGPRs,
1680+
unsigned &ExcessArchVGPRs,
1681+
bool &AGPRLimited) {
1682+
unsigned NumAGPRs = RP.getAGPRNum();
1683+
if (!ST.hasGFX90AInsts() || !NumAGPRs) {
1684+
// Non-unified RF. We can only reduce ArchVGPR excess pressure at this
1685+
// point, but still want to identify when there is AGPR excess pressure.
1686+
bool HasSpill = false;
1687+
unsigned NumArchVGPRs = RP.getArchVGPRNum();
1688+
if (NumArchVGPRs > MaxVGPRs) {
1689+
ExcessArchVGPRs = NumArchVGPRs - MaxVGPRs;
1690+
HasSpill = true;
1691+
}
1692+
if (NumAGPRs > MaxVGPRs) {
1693+
AGPRLimited = true;
1694+
HasSpill = true;
1695+
}
1696+
return HasSpill;
1697+
}
1698+
if (RP.getVGPRNum(true) > MaxVGPRs) {
1699+
// Unified RF. We can only remat ArchVGPRs; AGPR pressure alone may prevent
1700+
// us from eliminating spilling.
1701+
unsigned NumArchVGPRs = RP.getArchVGPRNum();
1702+
if (NumAGPRs >= MaxVGPRs) {
1703+
AGPRLimited = true;
1704+
ExcessArchVGPRs = NumArchVGPRs;
1705+
} else {
1706+
ExcessArchVGPRs = NumArchVGPRs - alignDown(MaxVGPRs - NumAGPRs, 4);
1707+
}
1708+
return true;
1709+
}
1710+
return false;
1711+
}
1712+
16781713
bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
16791714
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
16801715

@@ -1684,51 +1719,86 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
16841719
// Maps optimizable regions (i.e., regions at minimum and VGPR-limited
16851720
// occupancy, or regions with VGPR spilling) to their excess RP.
16861721
DenseMap<unsigned, unsigned> OptRegions;
1687-
1688-
// Note that the maximum number of VGPRs to use to eliminate spill may be
1689-
// lower than the maximum number to increase occupancy when the function has
1690-
// the "amdgpu-num-vgpr" attribute.
1691-
const std::pair<unsigned, unsigned> OccBounds =
1722+
const Function &F = MF.getFunction();
1723+
const bool UnifiedRF = ST.hasGFX90AInsts();
1724+
1725+
// Adjust workgroup size induced occupancy bounds with the
1726+
// "amdgpu-waves-per-eu" attribute. This should be offloaded to a subtarget
1727+
// method, but at this point is if unclear how other parts of the codebase
1728+
// interpret this attribute and the default behavior produces unexpected
1729+
// bounds. Here we want to allow users to ask for target occupancies lower
1730+
// than the default lower bound.
1731+
std::pair<unsigned, unsigned> OccBounds =
16921732
ST.getOccupancyWithWorkGroupSizes(MF);
1693-
// FIXME: we should be able to just call ST.getMaxNumArchVGPRs() but that
1694-
// would use the occupancy bounds as determined by
1695-
// MF.getFunction().getWavesPerEU(), which look incorrect in some cases.
1733+
std::pair<unsigned, unsigned> WavesPerEU =
1734+
AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
1735+
if (WavesPerEU.first <= WavesPerEU.second) {
1736+
if (WavesPerEU.first && WavesPerEU.first <= OccBounds.second)
1737+
OccBounds.first = WavesPerEU.first;
1738+
if (WavesPerEU.second)
1739+
OccBounds.second = std::min(OccBounds.second, WavesPerEU.second);
1740+
}
1741+
1742+
// We call the "base max functions" directly because otherwise it uses the
1743+
// subtarget's logic for combining "amdgpu-waves-per-eu" with the function's
1744+
// groupsize induced occupancy bounds, producing unexpected results.
1745+
const unsigned MaxSGPRsNoSpill = ST.getBaseMaxNumSGPRs(
1746+
F, OccBounds, ST.getMaxNumPreloadedSGPRs(), ST.getReservedNumSGPRs(F));
16961747
const unsigned MaxVGPRsNoSpill =
1697-
ST.getBaseMaxNumVGPRs(MF.getFunction(),
1698-
{ST.getMinNumArchVGPRs(OccBounds.second),
1699-
ST.getMaxNumArchVGPRs(OccBounds.first)},
1700-
false);
1701-
const unsigned MaxVGPRsIncOcc = ST.getMaxNumArchVGPRs(DAG.MinOccupancy + 1);
1748+
ST.getBaseMaxNumVGPRs(F, {ST.getMinNumVGPRs(OccBounds.second),
1749+
ST.getMaxNumVGPRs(OccBounds.first)});
1750+
const unsigned MaxSGPRsMinOcc = ST.getMaxNumSGPRs(DAG.MinOccupancy, false);
1751+
const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs(DAG.MinOccupancy + 1);
17021752
IncreaseOccupancy = OccBounds.second > DAG.MinOccupancy;
17031753

1754+
auto ClearOptRegionsIf = [&](bool Cond) -> bool {
1755+
if (Cond) {
1756+
// We won't try to increase occupancy.
1757+
IncreaseOccupancy = false;
1758+
OptRegions.clear();
1759+
}
1760+
return Cond;
1761+
};
1762+
17041763
// Collect optimizable regions. If there is spilling in any region we will
1705-
// just try to reduce it. Otherwise we will try to increase occupancy by one.
1764+
// just try to reduce ArchVGPR spilling. Otherwise we will try to increase
1765+
// occupancy by one in the whole function.
17061766
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
17071767
GCNRegPressure &RP = DAG.Pressure[I];
1708-
unsigned NumVGPRs = RP.getArchVGPRNum();
17091768
unsigned ExcessRP = 0;
1710-
if (NumVGPRs > MaxVGPRsNoSpill) {
1711-
if (IncreaseOccupancy) {
1712-
// We won't try to increase occupancy.
1713-
IncreaseOccupancy = false;
1714-
OptRegions.clear();
1715-
}
1716-
// Region has VGPR spilling, we will try to reduce spilling as much as
1717-
// possible.
1718-
ExcessRP = NumVGPRs - MaxVGPRsNoSpill;
1719-
REMAT_DEBUG(dbgs() << "Region " << I << " is spilling VGPRs, save "
1720-
<< ExcessRP << " VGPR(s) to eliminate spilling\n");
1769+
unsigned NumSGPRs = RP.getSGPRNum();
1770+
1771+
// Check whether SGPR pressures prevents us from eliminating spilling.
1772+
if (NumSGPRs > MaxSGPRsNoSpill)
1773+
ClearOptRegionsIf(IncreaseOccupancy);
1774+
1775+
bool OccAGPRLimited = false;
1776+
if (hasExcessVGPRs(RP, MaxVGPRsNoSpill, ExcessRP, OccAGPRLimited)) {
1777+
ClearOptRegionsIf(IncreaseOccupancy);
1778+
REMAT_DEBUG({
1779+
if (ExcessRP) {
1780+
StringRef RegClass = UnifiedRF ? "VGPRs" : "ArchVGPRs";
1781+
dbgs() << "Region " << I << " is spilling " << RegClass << ", save "
1782+
<< ExcessRP << " to eliminate " << RegClass << "-spilling\n";
1783+
}
1784+
});
17211785
} else if (IncreaseOccupancy) {
1722-
if (ST.getOccupancyWithNumSGPRs(RP.getSGPRNum()) == DAG.MinOccupancy) {
1723-
// Occupancy is SGPR-limited in the region, no point in trying to
1724-
// increase it through VGPR usage.
1725-
IncreaseOccupancy = false;
1726-
OptRegions.clear();
1727-
} else if (NumVGPRs > MaxVGPRsIncOcc) {
1728-
// Occupancy is VGPR-limited.
1729-
ExcessRP = NumVGPRs - MaxVGPRsIncOcc;
1730-
REMAT_DEBUG(dbgs() << "Region " << I << " has min. occupancy: save "
1731-
<< ExcessRP << " VGPR(s) to improve occupancy\n");
1786+
// Check whether SGPR pressure prevents us from increasing occupancy.
1787+
if (ClearOptRegionsIf(NumSGPRs > MaxSGPRsMinOcc))
1788+
continue;
1789+
1790+
if (hasExcessVGPRs(RP, MaxVGPRsIncOcc, ExcessRP, OccAGPRLimited)) {
1791+
// Check whether AGPR pressure prevents us from increasing occupancy.
1792+
if (ClearOptRegionsIf(OccAGPRLimited))
1793+
continue;
1794+
// Occupancy could be increased by rematerializing ArchVGPRs.
1795+
REMAT_DEBUG({
1796+
if (ExcessRP) {
1797+
StringRef RegClass = UnifiedRF ? "VGPRs" : "ArchVGPRs";
1798+
dbgs() << "Region " << I << " has min. occupancy: save " << ExcessRP
1799+
<< " " << RegClass << " to improve occupancy\n";
1800+
}
1801+
});
17321802
}
17331803
}
17341804
if (ExcessRP)
@@ -1738,7 +1808,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
17381808
return false;
17391809

17401810
// When we are reducing spilling, the target is the minimum achievable
1741-
// occupancy implied by workgroup sizes.
1811+
// occupancy implied by workgroup sizes / the "amdgpu-waves-per-eu" attribute.
17421812
TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 : OccBounds.first;
17431813

17441814
// Accounts for a reduction in RP in an optimizable region. Returns whether we
@@ -2058,9 +2128,11 @@ static MachineBasicBlock *getRegionMBB(MachineFunction &MF,
20582128
void PreRARematStage::finalizeGCNSchedStage() {
20592129
// We consider that reducing spilling is always beneficial so we never
20602130
// rollback rematerializations in such cases. It's also possible that
2061-
// rescheduling lowers occupancy over the one achived just through remats, in
2062-
// which case we do not want to rollback either.
2063-
if (!IncreaseOccupancy || AchievedOcc == TargetOcc)
2131+
// rescheduling lowers occupancy over the one achieved just through remats, in
2132+
// which case we do not want to rollback either (the rescheduling was already
2133+
// reverted in PreRARematStage::shouldRevertScheduling in such cases).
2134+
unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
2135+
if (!IncreaseOccupancy || MaxOcc == TargetOcc)
20642136
return;
20652137

20662138
REMAT_DEBUG(dbgs() << "Rollbacking all rematerializations\n");
@@ -2077,10 +2149,7 @@ void PreRARematStage::finalizeGCNSchedStage() {
20772149

20782150
// Re-rematerialize MI at the end of its original region. Note that it may
20792151
// not be rematerialized exactly in the same position as originally within
2080-
// the region, but it should not matter much. Since we are only
2081-
// rematerializing instructions that do not have any virtual reg uses, we
2082-
// do not need to call LiveRangeEdit::allUsesAvailableAt() and
2083-
// LiveRangeEdit::canRematerializeAt().
2152+
// the region, but it should not matter much.
20842153
TII->reMaterialize(*MBB, InsertPos, Reg, SubReg, RematMI, *DAG.TRI);
20852154
MachineInstr *NewMI = &*std::prev(InsertPos);
20862155
NewMI->getOperand(0).setSubReg(SubReg);

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -485,6 +485,13 @@ class PreRARematStage : public GCNSchedStage {
485485
/// spilling.
486486
bool IncreaseOccupancy;
487487

488+
/// Determines whether there is excess VGPR pressure in \p RP w.r.t. \p
489+
/// MaxVGPRs. If there is, \p ExcessArchVGPRs is set to the number of
490+
/// ArchVGPRs one must save to eliminate the excess and \p AGPRLimited is set
491+
/// to true if AGPR pressure alone causes an excess.
492+
bool hasExcessVGPRs(const GCNRegPressure &RP, unsigned MaxVGPRs,
493+
unsigned &ExcessArchVGPRs, bool &AGPRLimited);
494+
488495
/// Returns whether remat can reduce spilling or increase function occupancy
489496
/// by 1 through rematerialization. If it can do one, collects instructions in
490497
/// PreRARematStage::Rematerializations and sets the target occupancy in

llvm/lib/Target/AMDGPU/GCNSubtarget.cpp

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,7 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
466466
getReservedNumSGPRs(MF));
467467
}
468468

469-
static unsigned getMaxNumPreloadedSGPRs() {
469+
unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
470470
using USI = GCNUserSGPRUsageInfo;
471471
// Max number of user SGPRs
472472
const unsigned MaxUserSGPRs =
@@ -496,10 +496,8 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
496496
getReservedNumSGPRs(F));
497497
}
498498

499-
unsigned
500-
GCNSubtarget::getBaseMaxNumVGPRs(const Function &F,
501-
std::pair<unsigned, unsigned> NumVGPRBounds,
502-
bool UnifiedRF) const {
499+
unsigned GCNSubtarget::getBaseMaxNumVGPRs(
500+
const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
503501
const auto &[Min, Max] = NumVGPRBounds;
504502

505503
// Check if maximum number of VGPRs was explicitly requested using
@@ -510,7 +508,7 @@ GCNSubtarget::getBaseMaxNumVGPRs(const Function &F,
510508
unsigned Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", Max);
511509
if (!Requested)
512510
return Max;
513-
if (UnifiedRF)
511+
if (hasGFX90AInsts())
514512
Requested *= 2;
515513

516514
// Make sure requested value is inside the range of possible VGPR usage.
@@ -520,15 +518,7 @@ GCNSubtarget::getBaseMaxNumVGPRs(const Function &F,
520518
unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
521519
std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
522520
return getBaseMaxNumVGPRs(
523-
F, {getMinNumVGPRs(Waves.second), getMaxNumVGPRs(Waves.first)},
524-
hasGFX90AInsts());
525-
}
526-
527-
unsigned GCNSubtarget::getMaxNumArchVGPRs(const Function &F) const {
528-
std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
529-
return getBaseMaxNumVGPRs(
530-
F, {getMinNumArchVGPRs(Waves.second), getMaxNumArchVGPRs(Waves.first)},
531-
false);
521+
F, {getMinNumVGPRs(Waves.second), getMaxNumVGPRs(Waves.first)});
532522
}
533523

534524
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 6 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1487,6 +1487,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
14871487
/// \returns Reserved number of SGPRs for given function \p F.
14881488
unsigned getReservedNumSGPRs(const Function &F) const;
14891489

1490+
/// \returns Maximum number of preloaded SGPRs for the subtarget.
1491+
unsigned getMaxNumPreloadedSGPRs() const;
1492+
14901493
/// \returns max num SGPRs. This is the common utility
14911494
/// function called by MachineFunction and Function
14921495
/// variants of getMaxNumSGPRs.
@@ -1547,29 +1550,16 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
15471550
return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
15481551
}
15491552

1550-
/// \returns the minimum number of ArchVGPRs that will prevent achieving more
1551-
/// than the specified number of waves \p WavesPerEU.
1552-
unsigned getMinNumArchVGPRs(unsigned WavesPerEU) const {
1553-
return AMDGPU::IsaInfo::getMinNumArchVGPRs(this, WavesPerEU);
1554-
}
1555-
15561553
/// \returns the maximum number of VGPRs that can be used and still achieved
15571554
/// at least the specified number of waves \p WavesPerEU.
15581555
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
15591556
return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
15601557
}
15611558

1562-
/// \returns the maximum number of ArchVGPRs that can be used and still
1563-
/// achieve at least the specified number of waves \p WavesPerEU.
1564-
unsigned getMaxNumArchVGPRs(unsigned WavesPerEU) const {
1565-
return AMDGPU::IsaInfo::getMaxNumArchVGPRs(this, WavesPerEU);
1566-
}
1567-
1568-
/// \returns max num VGPRs. This is the common utility function called by
1569-
/// MachineFunction and Function variants of getMaxNum[Arch]VGPRs.
1559+
/// \returns max num VGPRs. This is the common utility function
1560+
/// called by MachineFunction and Function variants of getMaxNumVGPRs.
15701561
unsigned getBaseMaxNumVGPRs(const Function &F,
1571-
std::pair<unsigned, unsigned> NumVGPRBounds,
1572-
bool UnifiedRF) const;
1562+
std::pair<unsigned, unsigned> NumVGPRBounds) const;
15731563

15741564
/// \returns Maximum number of VGPRs that meets number of waves per execution
15751565
/// unit requirement for function \p F, or number of VGPRs explicitly
@@ -1581,12 +1571,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
15811571
/// unit requirement.
15821572
unsigned getMaxNumVGPRs(const Function &F) const;
15831573

1584-
/// Returns the maximum number of ArchVGPRs that meets number of waves per
1585-
/// execution unit requirement for function \p F, or number of ArchVGPRs
1586-
/// explicitly requested using "amdgpu-num-vgpr" attribute attached to
1587-
/// function \p F.
1588-
unsigned getMaxNumArchVGPRs(const Function &F) const;
1589-
15901574
unsigned getMaxNumAGPRs(const Function &F) const {
15911575
return getMaxNumVGPRs(F);
15921576
}

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 13 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1240,36 +1240,28 @@ unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
12401240
return 5;
12411241
}
12421242

1243-
unsigned getBaseMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
1244-
unsigned TotalNumVGPRs,
1245-
unsigned NumAddressableVGPRs) {
1243+
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
12461244
assert(WavesPerEU != 0);
1245+
12471246
unsigned MaxWavesPerEU = getMaxWavesPerEU(STI);
12481247
if (WavesPerEU >= MaxWavesPerEU)
12491248
return 0;
1250-
unsigned MinWavesPerEU =
1251-
getNumWavesPerEUWithNumVGPRs(STI, NumAddressableVGPRs);
1252-
if (WavesPerEU < MinWavesPerEU)
1253-
return getMinNumVGPRs(STI, MinWavesPerEU);
12541249

1250+
unsigned TotNumVGPRs = getTotalNumVGPRs(STI);
1251+
unsigned AddrsableNumVGPRs = getAddressableNumVGPRs(STI);
12551252
unsigned Granule = getVGPRAllocGranule(STI);
1256-
unsigned MaxNumVGPRs = alignDown(TotalNumVGPRs / WavesPerEU, Granule);
1257-
if (MaxNumVGPRs == alignDown(TotalNumVGPRs / MaxWavesPerEU, Granule))
1253+
unsigned MaxNumVGPRs = alignDown(TotNumVGPRs / WavesPerEU, Granule);
1254+
1255+
if (MaxNumVGPRs == alignDown(TotNumVGPRs / MaxWavesPerEU, Granule))
12581256
return 0;
1259-
unsigned MaxNumVGPRsNext =
1260-
alignDown(TotalNumVGPRs / (WavesPerEU + 1), Granule);
1261-
unsigned MinNumVGPRs = 1 + std::min(MaxNumVGPRs - Granule, MaxNumVGPRsNext);
1262-
return std::min(MinNumVGPRs, NumAddressableVGPRs);
1263-
}
12641257

1265-
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
1266-
return getBaseMinNumVGPRs(STI, WavesPerEU, getTotalNumVGPRs(STI),
1267-
getAddressableNumVGPRs(STI));
1268-
}
1258+
unsigned MinWavesPerEU = getNumWavesPerEUWithNumVGPRs(STI, AddrsableNumVGPRs);
1259+
if (WavesPerEU < MinWavesPerEU)
1260+
return getMinNumVGPRs(STI, MinWavesPerEU);
12691261

1270-
unsigned getMinNumArchVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
1271-
unsigned TotNumArchVGPRs = getAddressableNumArchVGPRs(STI);
1272-
return getBaseMinNumVGPRs(STI, WavesPerEU, TotNumArchVGPRs, TotNumArchVGPRs);
1262+
unsigned MaxNumVGPRsNext = alignDown(TotNumVGPRs / (WavesPerEU + 1), Granule);
1263+
unsigned MinNumVGPRs = 1 + std::min(MaxNumVGPRs - Granule, MaxNumVGPRsNext);
1264+
return std::min(MinNumVGPRs, AddrsableNumVGPRs);
12731265
}
12741266

12751267
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
@@ -1281,12 +1273,6 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
12811273
return std::min(MaxNumVGPRs, AddressableNumVGPRs);
12821274
}
12831275

1284-
unsigned getMaxNumArchVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
1285-
assert(WavesPerEU != 0);
1286-
return alignDown(getAddressableNumArchVGPRs(STI) / WavesPerEU,
1287-
getVGPRAllocGranule(STI));
1288-
}
1289-
12901276
unsigned getEncodedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs,
12911277
std::optional<bool> EnableWavefrontSize32) {
12921278
return getGranulatedNumRegisterBlocks(

0 commit comments

Comments
 (0)