Skip to content

Commit c6027f6

Browse files
committed
Address feedback
- Allow remat in same block if using MI is in different region. Add unit test. - Take into account excess ArchVGPR/AGPR pressure above addressable limits. Add unit tests to make sure this works. - Fix unused register in existing unit test.
1 parent a1f1e4d commit c6027f6

File tree

5 files changed

+2138
-163
lines changed

5 files changed

+2138
-163
lines changed

llvm/lib/Target/AMDGPU/GCNRegPressure.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,19 @@ struct GCNRegPressure {
5353
/// UnifiedVGPRFile
5454
unsigned getVGPRNum(bool UnifiedVGPRFile) const {
5555
if (UnifiedVGPRFile) {
56-
return Value[AGPR32] ? alignTo(Value[VGPR32], 4) + Value[AGPR32]
56+
return Value[AGPR32] ? getUnifiedVGPRNum(Value[VGPR32], Value[AGPR32])
5757
: Value[VGPR32] + Value[AGPR32];
5858
}
5959
return std::max(Value[VGPR32], Value[AGPR32]);
6060
}
61+
62+
/// Returns the aggregated VGPR pressure, assuming \p NumArchVGPRs ArchVGPRs
63+
/// and \p NumAGPRs AGPRs, for a target with a unified VGPR file.
64+
inline static unsigned getUnifiedVGPRNum(unsigned NumArchVGPRs,
65+
unsigned NumAGPRs) {
66+
return alignTo(NumArchVGPRs, 4) + NumAGPRs;
67+
}
68+
6169
/// \returns the ArchVGPR32 pressure
6270
unsigned getArchVGPRNum() const { return Value[VGPR32]; }
6371
/// \returns the AccVGPR32 pressure

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 168 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1682,40 +1682,135 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
16821682
return true;
16831683
}
16841684

1685-
bool PreRARematStage::hasExcessVGPRs(const GCNRegPressure &RP,
1686-
unsigned MaxVGPRs,
1687-
unsigned &ExcessArchVGPRs,
1688-
bool &AGPRLimited) {
1685+
namespace {
1686+
/// Models excess register pressure in a region and tracks our progress as we
1687+
/// identify rematerialization opportunities.
1688+
struct ExcessRP {
1689+
/// Number of excess ArchVGPRs.
1690+
unsigned ArchVGPRs = 0;
1691+
/// Number of excess AGPRs.
1692+
unsigned AGPRs = 0;
1693+
/// For unified register files, number of excess VGPRs.
1694+
unsigned VGPRs = 0;
1695+
/// For unified register files with AGPR usage, number of excess ArchVGPRs to
1696+
/// save before we are able to save a whole allocation granule.
1697+
unsigned ArchVGPRsToAlignment = 0;
1698+
/// Whether the region uses AGPRs.
1699+
bool HasAGPRs = false;
1700+
1701+
/// Constructs the excess RP model; determines the excess pressure w.r.t. a
1702+
/// maximum number of allowed VGPRs.
1703+
ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP, unsigned MaxVGPRs);
1704+
1705+
/// Accounts for \p NumRegs saved ArchVGPRs in the model. If \p
1706+
/// UseArchVGPRForAGPRSpill is true, saved ArchVGPRs are used to save excess
1707+
/// AGPRs once excess ArchVGPR pressure has been eliminated. Returns whether
1708+
/// saving these ArchVGPRs helped reduce excess pressure.
1709+
bool saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill);
1710+
1711+
/// Accounts for \p NumRegs saved AGPRs in the model. Returns whether saving
1712+
/// these AGPRs helped reduce excess pressure.
1713+
bool saveAGPRs(unsigned NumRegs);
1714+
1715+
/// Returns whether there is any excess register pressure.
1716+
operator bool() const { return ArchVGPRs != 0 || AGPRs != 0 || VGPRs != 0; }
1717+
1718+
#ifndef NDEBUG
1719+
friend raw_ostream &operator<<(raw_ostream &OS, const ExcessRP &Excess) {
1720+
OS << Excess.ArchVGPRs << " ArchVGPRs, " << Excess.AGPRs << " AGPRs, and "
1721+
<< Excess.VGPRs << " VGPRs (next ArchVGPR alignment in "
1722+
<< Excess.ArchVGPRsToAlignment << " registers)\n";
1723+
return OS;
1724+
}
1725+
#endif
1726+
1727+
private:
1728+
static inline bool saveRegs(unsigned &LeftToSave, unsigned &NumRegs) {
1729+
unsigned NumSaved = std::min(LeftToSave, NumRegs);
1730+
NumRegs -= NumSaved;
1731+
LeftToSave -= NumSaved;
1732+
return NumSaved;
1733+
}
1734+
};
1735+
} // namespace
1736+
1737+
ExcessRP::ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP,
1738+
unsigned MaxVGPRs) {
1739+
unsigned NumArchVGPRs = RP.getArchVGPRNum();
16891740
unsigned NumAGPRs = RP.getAGPRNum();
1690-
if (!ST.hasGFX90AInsts() || !NumAGPRs) {
1691-
// Non-unified RF. We can only reduce ArchVGPR excess pressure at this
1692-
// point, but still want to identify when there is AGPR excess pressure.
1693-
bool HasSpill = false;
1694-
unsigned NumArchVGPRs = RP.getArchVGPRNum();
1695-
if (NumArchVGPRs > MaxVGPRs) {
1696-
ExcessArchVGPRs = NumArchVGPRs - MaxVGPRs;
1697-
HasSpill = true;
1698-
}
1699-
if (NumAGPRs > MaxVGPRs) {
1700-
ExcessArchVGPRs = NumArchVGPRs;
1701-
AGPRLimited = true;
1702-
HasSpill = true;
1741+
HasAGPRs = NumAGPRs;
1742+
1743+
if (!ST.hasGFX90AInsts()) {
1744+
// Non-unified RF. Account for excess pressure for ArchVGPRs and AGPRs
1745+
// independently.
1746+
if (NumArchVGPRs > MaxVGPRs)
1747+
ArchVGPRs = NumArchVGPRs - MaxVGPRs;
1748+
if (NumAGPRs > MaxVGPRs)
1749+
AGPRs = NumAGPRs - MaxVGPRs;
1750+
return;
1751+
}
1752+
1753+
// Independently of whether overall VGPR pressure is under the limit, we still
1754+
// have to check whether ArchVGPR pressure or AGPR pressure alone exceeds the
1755+
// number of addressable registers in each category.
1756+
const unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs();
1757+
if (NumArchVGPRs > MaxArchVGPRs) {
1758+
ArchVGPRs = NumArchVGPRs - MaxArchVGPRs;
1759+
NumArchVGPRs = MaxArchVGPRs;
1760+
}
1761+
if (NumAGPRs > MaxArchVGPRs) {
1762+
AGPRs = NumAGPRs - MaxArchVGPRs;
1763+
NumAGPRs = MaxArchVGPRs;
1764+
}
1765+
1766+
// Check overall VGPR usage against the limit; any excess above addressable
1767+
// register limits has already been accounted for.
1768+
unsigned NumVGPRs = GCNRegPressure::getUnifiedVGPRNum(NumArchVGPRs, NumAGPRs);
1769+
if (NumVGPRs > MaxVGPRs) {
1770+
VGPRs = NumVGPRs - MaxVGPRs;
1771+
ArchVGPRsToAlignment = NumArchVGPRs - alignDown(NumArchVGPRs, 4);
1772+
if (!ArchVGPRsToAlignment)
1773+
ArchVGPRsToAlignment = 4;
1774+
}
1775+
}
1776+
1777+
bool ExcessRP::saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill) {
1778+
bool Progress = saveRegs(ArchVGPRs, NumRegs);
1779+
1780+
if (HasAGPRs) {
1781+
// ArchVGPRs can only be allocated as a multiple of a granule.
1782+
const unsigned Granule = 4;
1783+
unsigned NumSavedRegs = 0;
1784+
1785+
// Count the number of whole ArchVGPR allocation granules we can save.
1786+
if (unsigned NumGranules = NumRegs / Granule; NumGranules) {
1787+
NumSavedRegs = NumGranules * Granule;
1788+
NumRegs -= NumSavedRegs;
17031789
}
1704-
return HasSpill;
1705-
}
1706-
if (RP.getVGPRNum(true) > MaxVGPRs) {
1707-
// Unified RF. We can only remat ArchVGPRs; AGPR pressure alone may prevent
1708-
// us from eliminating spilling.
1709-
unsigned NumArchVGPRs = RP.getArchVGPRNum();
1710-
if (NumAGPRs >= MaxVGPRs) {
1711-
AGPRLimited = true;
1712-
ExcessArchVGPRs = NumArchVGPRs;
1790+
1791+
// We may be able to save one more whole ArchVGPR allocation granule.
1792+
if (NumRegs >= ArchVGPRsToAlignment) {
1793+
NumSavedRegs += Granule;
1794+
ArchVGPRsToAlignment = Granule - (NumRegs - ArchVGPRsToAlignment);
17131795
} else {
1714-
ExcessArchVGPRs = NumArchVGPRs - alignDown(MaxVGPRs - NumAGPRs, 4);
1796+
ArchVGPRsToAlignment -= NumRegs;
17151797
}
1716-
return true;
1798+
1799+
// Prioritize saving generic VGPRs, then AGPRs if we allow AGPR-to-ArchVGPR
1800+
// spilling and have some free ArchVGPR slots.
1801+
Progress |= saveRegs(VGPRs, NumSavedRegs);
1802+
if (UseArchVGPRForAGPRSpill)
1803+
Progress |= saveRegs(AGPRs, NumSavedRegs);
1804+
} else {
1805+
// No AGPR usage in the region i.e., no allocation granule to worry about.
1806+
Progress |= saveRegs(VGPRs, NumRegs);
17171807
}
1718-
return false;
1808+
1809+
return Progress;
1810+
}
1811+
1812+
bool ExcessRP::saveAGPRs(unsigned NumRegs) {
1813+
return saveRegs(AGPRs, NumRegs) || saveRegs(VGPRs, NumRegs);
17191814
}
17201815

17211816
bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
@@ -1725,10 +1820,9 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
17251820
<< MF.getFunction().getName() << '\n');
17261821

17271822
// Maps optimizable regions (i.e., regions at minimum and VGPR-limited
1728-
// occupancy, or regions with VGPR spilling) to their excess RP.
1729-
DenseMap<unsigned, unsigned> OptRegions;
1823+
// occupancy, or regions with VGPR spilling) to a model of their excess RP.
1824+
DenseMap<unsigned, ExcessRP> OptRegions;
17301825
const Function &F = MF.getFunction();
1731-
const bool UnifiedRF = ST.hasGFX90AInsts();
17321826

17331827
// Adjust workgroup size induced occupancy bounds with the
17341828
// "amdgpu-waves-per-eu" attribute. This should be offloaded to a subtarget
@@ -1774,70 +1868,70 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
17741868
// occupancy by one in the whole function.
17751869
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
17761870
GCNRegPressure &RP = DAG.Pressure[I];
1777-
unsigned ExcessRP = 0;
1778-
unsigned NumSGPRs = RP.getSGPRNum();
17791871

17801872
// Check whether SGPR pressures prevents us from eliminating spilling.
1873+
unsigned NumSGPRs = RP.getSGPRNum();
17811874
if (NumSGPRs > MaxSGPRsNoSpill)
17821875
ClearOptRegionsIf(IncreaseOccupancy);
17831876

1784-
bool OccAGPRLimited = false;
1785-
if (hasExcessVGPRs(RP, MaxVGPRsNoSpill, ExcessRP, OccAGPRLimited)) {
1877+
ExcessRP Excess(ST, RP, MaxVGPRsNoSpill);
1878+
if (Excess) {
17861879
ClearOptRegionsIf(IncreaseOccupancy);
1787-
REMAT_DEBUG({
1788-
if (ExcessRP) {
1789-
StringRef RegClass = UnifiedRF ? "VGPRs" : "ArchVGPRs";
1790-
dbgs() << "Region " << I << " is spilling " << RegClass << ", save "
1791-
<< ExcessRP << " to eliminate " << RegClass << "-spilling\n";
1792-
}
1793-
});
17941880
} else if (IncreaseOccupancy) {
17951881
// Check whether SGPR pressure prevents us from increasing occupancy.
17961882
if (ClearOptRegionsIf(NumSGPRs > MaxSGPRsIncOcc)) {
17971883
if (DAG.MinOccupancy >= OccBounds.first)
17981884
return false;
17991885
continue;
18001886
}
1801-
1802-
if (hasExcessVGPRs(RP, MaxVGPRsIncOcc, ExcessRP, OccAGPRLimited)) {
1803-
// Check whether AGPR pressure prevents us from increasing occupancy.
1804-
if (ClearOptRegionsIf(OccAGPRLimited)) {
1887+
if ((Excess = ExcessRP(ST, RP, MaxVGPRsIncOcc))) {
1888+
// We can only rematerialize ArchVGPRs at this point.
1889+
unsigned NumArchVGPRsToRemat = Excess.ArchVGPRs + Excess.VGPRs;
1890+
bool NotEnoughArchVGPRs = NumArchVGPRsToRemat > RP.getArchVGPRNum();
1891+
if (ClearOptRegionsIf(Excess.AGPRs || NotEnoughArchVGPRs)) {
18051892
if (DAG.MinOccupancy >= OccBounds.first)
18061893
return false;
18071894
continue;
18081895
}
1809-
1810-
// Occupancy could be increased by rematerializing ArchVGPRs.
1811-
REMAT_DEBUG({
1812-
if (ExcessRP) {
1813-
StringRef RegClass = UnifiedRF ? "VGPRs" : "ArchVGPRs";
1814-
dbgs() << "Region " << I << " has min. occupancy: save " << ExcessRP
1815-
<< " " << RegClass << " to improve occupancy\n";
1816-
}
1817-
});
18181896
}
18191897
}
1820-
if (ExcessRP)
1821-
OptRegions.insert({I, ExcessRP});
1898+
if (Excess)
1899+
OptRegions.insert({I, Excess});
18221900
}
18231901
if (OptRegions.empty())
18241902
return false;
18251903

1904+
#ifndef NDEBUG
1905+
if (IncreaseOccupancy)
1906+
REMAT_DEBUG(dbgs() << "Occupancy minimal in regions:\n");
1907+
else
1908+
REMAT_DEBUG(dbgs() << "Spilling in regions:\n");
1909+
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I)
1910+
if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end())
1911+
REMAT_DEBUG(dbgs() << " " << I << ": " << OptIt->getSecond() << '\n');
1912+
#endif
1913+
18261914
// When we are reducing spilling, the target is the minimum achievable
1827-
// occupancy implied by workgroup sizes / the "amdgpu-waves-per-eu" attribute.
1915+
// occupancy implied by workgroup sizes / the "amdgpu-waves-per-eu"
1916+
// attribute.
18281917
TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 : OccBounds.first;
18291918

18301919
// Accounts for a reduction in RP in an optimizable region. Returns whether we
18311920
// estimate that we have identified enough rematerialization opportunities to
1832-
// achieve our goal.
1833-
auto ReduceRPInRegion = [&](auto OptIt, LaneBitmask Mask) -> bool {
1834-
auto NumRegs = SIRegisterInfo::getNumCoveredRegs(Mask);
1835-
unsigned I = OptIt->getFirst();
1836-
unsigned &Excess = OptIt->getSecond();
1837-
if (NumRegs >= Excess)
1838-
OptRegions.erase(I);
1839-
else
1840-
Excess -= NumRegs;
1921+
// achieve our goal, and sets Progress to true when this particular reduction
1922+
// in pressure was helpful toward that goal.
1923+
auto ReduceRPInRegion = [&](auto OptIt, LaneBitmask Mask,
1924+
bool &Progress) -> bool {
1925+
ExcessRP &Excess = OptIt->getSecond();
1926+
// We allow saved ArchVGPRs to be considered as free spill slots for AGPRs
1927+
// only when we are just trying to eliminate spilling to memory. At this
1928+
// point we err on the conservative side and do not increase
1929+
// register-to-register spilling for the sake of increasing occupancy.
1930+
Progress |=
1931+
Excess.saveArchVGPRs(SIRegisterInfo::getNumCoveredRegs(Mask),
1932+
/*UseArchVGPRForAGPRSpill=*/!IncreaseOccupancy);
1933+
if (!Excess)
1934+
OptRegions.erase(OptIt->getFirst());
18411935
return OptRegions.empty();
18421936
};
18431937

@@ -1865,9 +1959,11 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
18651959
continue;
18661960

18671961
// We only care to rematerialize the instruction if it has a single
1868-
// non-debug user in a different block.
1962+
// non-debug user in a different block. The using MI may not belong to a
1963+
// region if it is a lone region terminator.
18691964
MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
1870-
if (!UseMI || DefMI.getParent() == UseMI->getParent())
1965+
auto UseRegion = MIRegion.find(UseMI);
1966+
if (!UseMI || (UseRegion != MIRegion.end() && UseRegion->second == I))
18711967
continue;
18721968

18731969
// Do not rematerialize an instruction if it uses or is used by an
@@ -1901,9 +1997,8 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
19011997
// maximum RP in the region is reached somewhere between the defining
19021998
// instruction and the end of the region.
19031999
REMAT_DEBUG(dbgs() << " Defining region is optimizable\n");
1904-
RematUseful = true;
19052000
LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
1906-
if (ReduceRPInRegion(It, Mask))
2001+
if (ReduceRPInRegion(It, Mask, RematUseful))
19072002
return true;
19082003
}
19092004

@@ -1923,8 +2018,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
19232018
// instruction's use.
19242019
if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
19252020
REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n');
1926-
RematUseful = true;
1927-
if (ReduceRPInRegion(It, DAG.LiveIns[LIRegion][Reg]))
2021+
if (ReduceRPInRegion(It, DAG.LiveIns[LIRegion][Reg], RematUseful))
19282022
return true;
19292023
}
19302024
}

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -484,13 +484,6 @@ class PreRARematStage : public GCNSchedStage {
484484
/// spilling.
485485
bool IncreaseOccupancy;
486486

487-
/// Determines whether there is excess VGPR pressure in \p RP w.r.t. \p
488-
/// MaxVGPRs. If there is, \p ExcessArchVGPRs is set to the number of
489-
/// ArchVGPRs one must save to eliminate the excess and \p AGPRLimited is set
490-
/// to true if AGPR pressure alone causes an excess.
491-
bool hasExcessVGPRs(const GCNRegPressure &RP, unsigned MaxVGPRs,
492-
unsigned &ExcessArchVGPRs, bool &AGPRLimited);
493-
494487
/// Returns whether remat can reduce spilling or increase function occupancy
495488
/// by 1 through rematerialization. If it can do one, collects instructions in
496489
/// PreRARematStage::Rematerializations and sets the target occupancy in

0 commit comments

Comments
 (0)