@@ -1682,40 +1682,135 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
16821682 return true ;
16831683}
16841684
1685- bool PreRARematStage::hasExcessVGPRs (const GCNRegPressure &RP,
1686- unsigned MaxVGPRs,
1687- unsigned &ExcessArchVGPRs,
1688- bool &AGPRLimited) {
1685+ namespace {
1686+ // / Models excess register pressure in a region and tracks our progress as we
1687+ // / identify rematerialization opportunities.
1688+ struct ExcessRP {
1689+ // / Number of excess ArchVGPRs.
1690+ unsigned ArchVGPRs = 0 ;
1691+ // / Number of excess AGPRs.
1692+ unsigned AGPRs = 0 ;
1693+ // / For unified register files, number of excess VGPRs.
1694+ unsigned VGPRs = 0 ;
1695+ // / For unified register files with AGPR usage, number of excess ArchVGPRs to
1696+ // / save before we are able to save a whole allocation granule.
1697+ unsigned ArchVGPRsToAlignment = 0 ;
1698+ // / Whether the region uses AGPRs.
1699+ bool HasAGPRs = false ;
1700+
1701+ // / Constructs the excess RP model; determines the excess pressure w.r.t. a
1702+ // / maximum number of allowed VGPRs.
1703+ ExcessRP (const GCNSubtarget &ST, const GCNRegPressure &RP, unsigned MaxVGPRs);
1704+
1705+ // / Accounts for \p NumRegs saved ArchVGPRs in the model. If \p
1706+ // / UseArchVGPRForAGPRSpill is true, saved ArchVGPRs are used to save excess
1707+ // / AGPRs once excess ArchVGPR pressure has been eliminated. Returns whether
1708+ // / saving these ArchVGPRs helped reduce excess pressure.
1709+ bool saveArchVGPRs (unsigned NumRegs, bool UseArchVGPRForAGPRSpill);
1710+
1711+ // / Accounts for \p NumRegs saved AGPRS in the model. Returns whether saving
1712+ // / these ArchVGPRs helped reduce excess pressure.
1713+ bool saveAGPRs (unsigned NumRegs);
1714+
1715+ // / Returns whether there is any excess register pressure.
1716+ operator bool () const { return ArchVGPRs != 0 || AGPRs != 0 || VGPRs != 0 ; }
1717+
1718+ #ifndef NDEBUG
1719+ friend raw_ostream &operator <<(raw_ostream &OS, const ExcessRP &Excess) {
1720+ OS << Excess.ArchVGPRs << " ArchVGPRs, " << Excess.AGPRs << " AGPRs, and "
1721+ << Excess.VGPRs << " VGPRs (next ArchVGPR aligment in "
1722+ << Excess.ArchVGPRsToAlignment << " registers)\n " ;
1723+ return OS;
1724+ }
1725+ #endif
1726+
1727+ private:
1728+ static inline bool saveRegs (unsigned &LeftToSave, unsigned &NumRegs) {
1729+ unsigned NumSaved = std::min (LeftToSave, NumRegs);
1730+ NumRegs -= NumSaved;
1731+ LeftToSave -= NumSaved;
1732+ return NumSaved;
1733+ }
1734+ };
1735+ } // namespace
1736+
1737+ ExcessRP::ExcessRP (const GCNSubtarget &ST, const GCNRegPressure &RP,
1738+ unsigned MaxVGPRs) {
1739+ unsigned NumArchVGPRs = RP.getArchVGPRNum ();
16891740 unsigned NumAGPRs = RP.getAGPRNum ();
1690- if (!ST.hasGFX90AInsts () || !NumAGPRs) {
1691- // Non-unified RF. We can only reduce ArchVGPR excess pressure at this
1692- // point, but still want to identify when there is AGPR excess pressure.
1693- bool HasSpill = false ;
1694- unsigned NumArchVGPRs = RP.getArchVGPRNum ();
1695- if (NumArchVGPRs > MaxVGPRs) {
1696- ExcessArchVGPRs = NumArchVGPRs - MaxVGPRs;
1697- HasSpill = true ;
1698- }
1699- if (NumAGPRs > MaxVGPRs) {
1700- ExcessArchVGPRs = NumArchVGPRs;
1701- AGPRLimited = true ;
1702- HasSpill = true ;
1741+ HasAGPRs = NumAGPRs;
1742+
1743+ if (!ST.hasGFX90AInsts ()) {
1744+ // Non-unified RF. Account for excess pressure for ArchVGPRs and AGPRs
1745+ // independently.
1746+ if (NumArchVGPRs > MaxVGPRs)
1747+ ArchVGPRs = NumArchVGPRs - MaxVGPRs;
1748+ if (NumAGPRs > MaxVGPRs)
1749+ AGPRs = NumAGPRs - MaxVGPRs;
1750+ return ;
1751+ }
1752+
1753+ // Independently of whether overall VGPR pressure is under the limit, we still
1754+ // have to check whether ArchVGPR pressure or AGPR pressure alone exceeds the
1755+ // number of addressable registers in each category.
1756+ const unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs ();
1757+ if (NumArchVGPRs > MaxArchVGPRs) {
1758+ ArchVGPRs = NumArchVGPRs - MaxArchVGPRs;
1759+ NumArchVGPRs = MaxArchVGPRs;
1760+ }
1761+ if (NumAGPRs > MaxArchVGPRs) {
1762+ AGPRs = NumAGPRs - MaxArchVGPRs;
1763+ NumAGPRs = MaxArchVGPRs;
1764+ }
1765+
1766+ // Check overall VGPR usage against the limit; any excess above addressable
1767+ // register limits has already been accounted for.
1768+ unsigned NumVGPRs = GCNRegPressure::getUnifiedVGPRNum (NumArchVGPRs, NumAGPRs);
1769+ if (NumVGPRs > MaxVGPRs) {
1770+ VGPRs = NumVGPRs - MaxVGPRs;
1771+ ArchVGPRsToAlignment = NumArchVGPRs - alignDown (NumArchVGPRs, 4 );
1772+ if (!ArchVGPRsToAlignment)
1773+ ArchVGPRsToAlignment = 4 ;
1774+ }
1775+ }
1776+
1777+ bool ExcessRP::saveArchVGPRs (unsigned NumRegs, bool UseArchVGPRForAGPRSpill) {
1778+ bool Progress = saveRegs (ArchVGPRs, NumRegs);
1779+
1780+ if (HasAGPRs) {
1781+ // ArchVGPRs can only be allocated as a multiple of a granule.
1782+ const unsigned Granule = 4 ;
1783+ unsigned NumSavedRegs = 0 ;
1784+
1785+ // Count the number of whole ArchVGPR allocation granules we can save.
1786+ if (unsigned NumGranules = NumRegs / Granule; NumGranules) {
1787+ NumSavedRegs = NumGranules * Granule;
1788+ NumRegs -= NumSavedRegs;
17031789 }
1704- return HasSpill;
1705- }
1706- if (RP.getVGPRNum (true ) > MaxVGPRs) {
1707- // Unified RF. We can only remat ArchVGPRs; AGPR pressure alone may prevent
1708- // us from eliminating spilling.
1709- unsigned NumArchVGPRs = RP.getArchVGPRNum ();
1710- if (NumAGPRs >= MaxVGPRs) {
1711- AGPRLimited = true ;
1712- ExcessArchVGPRs = NumArchVGPRs;
1790+
1791+ // We may be able to save one more whole ArchVGPR allocation granule.
1792+ if (NumRegs >= ArchVGPRsToAlignment) {
1793+ NumSavedRegs += Granule;
1794+ ArchVGPRsToAlignment = Granule - (NumRegs - ArchVGPRsToAlignment);
17131795 } else {
1714- ExcessArchVGPRs = NumArchVGPRs - alignDown (MaxVGPRs - NumAGPRs, 4 ) ;
1796+ ArchVGPRsToAlignment -= NumRegs ;
17151797 }
1716- return true ;
1798+
1799+ // Prioritize saving generic VGPRs, then AGPRs if we allow AGPR-to-ArchVGPR
1800+ // spilling and have some free ArchVGPR slots.
1801+ Progress |= saveRegs (VGPRs, NumSavedRegs);
1802+ if (UseArchVGPRForAGPRSpill)
1803+ Progress |= saveRegs (AGPRs, NumSavedRegs);
1804+ } else {
1805+ // No AGPR usage in the region i.e., no allocation granule to worry about.
1806+ Progress |= saveRegs (VGPRs, NumRegs);
17171807 }
1718- return false ;
1808+
1809+ return Progress;
1810+ }
1811+
1812+ bool ExcessRP::saveAGPRs (unsigned NumRegs) {
1813+ return saveRegs (AGPRs, NumRegs) || saveRegs (VGPRs, NumRegs);
17191814}
17201815
17211816bool PreRARematStage::canIncreaseOccupancyOrReduceSpill () {
@@ -1725,10 +1820,9 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
17251820 << MF.getFunction ().getName () << ' \n ' );
17261821
17271822 // Maps optimizable regions (i.e., regions at minimum and VGPR-limited
1728- // occupancy, or regions with VGPR spilling) to their excess RP.
1729- DenseMap<unsigned , unsigned > OptRegions;
1823+ // occupancy, or regions with VGPR spilling) to a model of their excess RP.
1824+ DenseMap<unsigned , ExcessRP > OptRegions;
17301825 const Function &F = MF.getFunction ();
1731- const bool UnifiedRF = ST.hasGFX90AInsts ();
17321826
17331827 // Adjust workgroup size induced occupancy bounds with the
17341828 // "amdgpu-waves-per-eu" attribute. This should be offloaded to a subtarget
@@ -1774,70 +1868,70 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
17741868 // occupancy by one in the whole function.
17751869 for (unsigned I = 0 , E = DAG.Regions .size (); I != E; ++I) {
17761870 GCNRegPressure &RP = DAG.Pressure [I];
1777- unsigned ExcessRP = 0 ;
1778- unsigned NumSGPRs = RP.getSGPRNum ();
17791871
17801872 // Check whether SGPR pressures prevents us from eliminating spilling.
1873+ unsigned NumSGPRs = RP.getSGPRNum ();
17811874 if (NumSGPRs > MaxSGPRsNoSpill)
17821875 ClearOptRegionsIf (IncreaseOccupancy);
17831876
1784- bool OccAGPRLimited = false ;
1785- if (hasExcessVGPRs (RP, MaxVGPRsNoSpill, ExcessRP, OccAGPRLimited) ) {
1877+ ExcessRP Excess (ST, RP, MaxVGPRsNoSpill) ;
1878+ if (Excess ) {
17861879 ClearOptRegionsIf (IncreaseOccupancy);
1787- REMAT_DEBUG ({
1788- if (ExcessRP) {
1789- StringRef RegClass = UnifiedRF ? " VGPRs" : " ArchVGPRs" ;
1790- dbgs () << " Region " << I << " is spilling " << RegClass << " , save "
1791- << ExcessRP << " to eliminate " << RegClass << " -spilling\n " ;
1792- }
1793- });
17941880 } else if (IncreaseOccupancy) {
17951881 // Check whether SGPR pressure prevents us from increasing occupancy.
17961882 if (ClearOptRegionsIf (NumSGPRs > MaxSGPRsIncOcc)) {
17971883 if (DAG.MinOccupancy >= OccBounds.first )
17981884 return false ;
17991885 continue ;
18001886 }
1801-
1802- if (hasExcessVGPRs (RP, MaxVGPRsIncOcc, ExcessRP, OccAGPRLimited)) {
1803- // Check whether AGPR pressure prevents us from increasing occupancy.
1804- if (ClearOptRegionsIf (OccAGPRLimited)) {
1887+ if ((Excess = ExcessRP (ST, RP, MaxVGPRsIncOcc))) {
1888+ // We can only rematerialize ArchVGPRs at this point.
1889+ unsigned NumArchVGPRsToRemat = Excess.ArchVGPRs + Excess.VGPRs ;
1890+ bool NotEnoughArchVGPRs = NumArchVGPRsToRemat > RP.getArchVGPRNum ();
1891+ if (ClearOptRegionsIf (Excess.AGPRs || NotEnoughArchVGPRs)) {
18051892 if (DAG.MinOccupancy >= OccBounds.first )
18061893 return false ;
18071894 continue ;
18081895 }
1809-
1810- // Occupancy could be increased by rematerializing ArchVGPRs.
1811- REMAT_DEBUG ({
1812- if (ExcessRP) {
1813- StringRef RegClass = UnifiedRF ? " VGPRs" : " ArchVGPRs" ;
1814- dbgs () << " Region " << I << " has min. occupancy: save " << ExcessRP
1815- << " " << RegClass << " to improve occupancy\n " ;
1816- }
1817- });
18181896 }
18191897 }
1820- if (ExcessRP )
1821- OptRegions.insert ({I, ExcessRP });
1898+ if (Excess )
1899+ OptRegions.insert ({I, Excess });
18221900 }
18231901 if (OptRegions.empty ())
18241902 return false ;
18251903
1904+ #ifndef DEBUG
1905+ if (IncreaseOccupancy)
1906+ REMAT_DEBUG (dbgs () << " Occupancy minimal in regions:\n " );
1907+ else
1908+ REMAT_DEBUG (dbgs () << " Spilling in regions:\n " );
1909+ for (unsigned I = 0 , E = DAG.Regions .size (); I != E; ++I)
1910+ if (auto OptIt = OptRegions.find (I); OptIt != OptRegions.end ())
1911+ REMAT_DEBUG (dbgs () << " " << I << " : " << OptIt->getSecond () << ' \n ' );
1912+ #endif
1913+
18261914 // When we are reducing spilling, the target is the minimum achievable
1827- // occupancy implied by workgroup sizes / the "amdgpu-waves-per-eu" attribute.
1915+ // occupancy implied by workgroup sizes / the "amdgpu-waves-per-eu"
1916+ // attribute.
18281917 TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 : OccBounds.first ;
18291918
18301919 // Accounts for a reduction in RP in an optimizable region. Returns whether we
18311920 // estimate that we have identified enough rematerialization opportunities to
1832- // achieve our goal.
1833- auto ReduceRPInRegion = [&](auto OptIt, LaneBitmask Mask) -> bool {
1834- auto NumRegs = SIRegisterInfo::getNumCoveredRegs (Mask);
1835- unsigned I = OptIt->getFirst ();
1836- unsigned &Excess = OptIt->getSecond ();
1837- if (NumRegs >= Excess)
1838- OptRegions.erase (I);
1839- else
1840- Excess -= NumRegs;
1921+ // achieve our goal, and sets Progress to true when this particular reduction
1922+ // in pressure was helpful toward that goal.
1923+ auto ReduceRPInRegion = [&](auto OptIt, LaneBitmask Mask,
1924+ bool &Progress) -> bool {
1925+ ExcessRP &Excess = OptIt->getSecond ();
1926+ // We allow saved ArchVGPRs to be considered as free spill slots for AGPRs
1927+ // only when we are just trying to eliminate spilling to memory. At this
1928+ // point we err on the conservative side and do not increase
1929+ // register-to-register spilling for the sake of increasing occupancy.
1930+ Progress |=
1931+ Excess.saveArchVGPRs (SIRegisterInfo::getNumCoveredRegs (Mask),
1932+ /* UseArchVGPRForAGPRSpill=*/ !IncreaseOccupancy);
1933+ if (!Excess)
1934+ OptRegions.erase (OptIt->getFirst ());
18411935 return OptRegions.empty ();
18421936 };
18431937
@@ -1865,9 +1959,11 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
18651959 continue ;
18661960
18671961 // We only care to rematerialize the instruction if it has a single
1868- // non-debug user in a different block.
1962+ // non-debug user in a different block. The using MI may not belong to a
1963+ // region if it is a lone region terminator.
18691964 MachineInstr *UseMI = DAG.MRI .getOneNonDBGUser (Reg);
1870- if (!UseMI || DefMI.getParent () == UseMI->getParent ())
1965+ auto UseRegion = MIRegion.find (UseMI);
1966+ if (!UseMI || (UseRegion != MIRegion.end () && UseRegion->second == I))
18711967 continue ;
18721968
18731969 // Do not rematerialize an instruction if it uses or is used by an
@@ -1901,9 +1997,8 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
19011997 // maximum RP in the region is reached somewhere between the defining
19021998 // instruction and the end of the region.
19031999 REMAT_DEBUG (dbgs () << " Defining region is optimizable\n " );
1904- RematUseful = true ;
19052000 LaneBitmask Mask = DAG.RegionLiveOuts .getLiveRegsForRegionIdx (I)[Reg];
1906- if (ReduceRPInRegion (It, Mask))
2001+ if (ReduceRPInRegion (It, Mask, RematUseful ))
19072002 return true ;
19082003 }
19092004
@@ -1923,8 +2018,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
19232018 // instruction's use.
19242019 if (auto It = OptRegions.find (LIRegion); It != OptRegions.end ()) {
19252020 REMAT_DEBUG (dbgs () << " Live-in in region " << LIRegion << ' \n ' );
1926- RematUseful = true ;
1927- if (ReduceRPInRegion (It, DAG.LiveIns [LIRegion][Reg]))
2021+ if (ReduceRPInRegion (It, DAG.LiveIns [LIRegion][Reg], RematUseful))
19282022 return true ;
19292023 }
19302024 }
0 commit comments