@@ -1675,6 +1675,41 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
16751675 return true ;
16761676}
16771677
1678+ bool PreRARematStage::hasExcessVGPRs (const GCNRegPressure &RP,
1679+ unsigned MaxVGPRs,
1680+ unsigned &ExcessArchVGPRs,
1681+ bool &AGPRLimited) {
1682+ unsigned NumAGPRs = RP.getAGPRNum ();
1683+ if (!ST.hasGFX90AInsts () || !NumAGPRs) {
1684+ // Non-unified RF. We can only reduce ArchVGPR excess pressure at this
1685+ // point, but still want to identify when there is AGPR excess pressure.
1686+ bool HasSpill = false ;
1687+ unsigned NumArchVGPRs = RP.getArchVGPRNum ();
1688+ if (NumArchVGPRs > MaxVGPRs) {
1689+ ExcessArchVGPRs = NumArchVGPRs - MaxVGPRs;
1690+ HasSpill = true ;
1691+ }
1692+ if (NumAGPRs > MaxVGPRs) {
1693+ AGPRLimited = true ;
1694+ HasSpill = true ;
1695+ }
1696+ return HasSpill;
1697+ }
1698+ if (RP.getVGPRNum (true ) > MaxVGPRs) {
1699+ // Unified RF. We can only remat ArchVGPRs; AGPR pressure alone may prevent
1700+ // us from eliminating spilling.
1701+ unsigned NumArchVGPRs = RP.getArchVGPRNum ();
1702+ if (NumAGPRs >= MaxVGPRs) {
1703+ AGPRLimited = true ;
1704+ ExcessArchVGPRs = NumArchVGPRs;
1705+ } else {
1706+ ExcessArchVGPRs = NumArchVGPRs - alignDown (MaxVGPRs - NumAGPRs, 4 );
1707+ }
1708+ return true ;
1709+ }
1710+ return false ;
1711+ }
1712+
16781713bool PreRARematStage::canIncreaseOccupancyOrReduceSpill () {
16791714 const SIRegisterInfo *SRI = static_cast <const SIRegisterInfo *>(DAG.TRI );
16801715
@@ -1684,51 +1719,86 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
16841719 // Maps optimizable regions (i.e., regions at minimum and VGPR-limited
16851720 // occupancy, or regions with VGPR spilling) to their excess RP.
16861721 DenseMap<unsigned , unsigned > OptRegions;
1687-
1688- // Note that the maximum number of VGPRs to use to eliminate spill may be
1689- // lower than the maximum number to increase occupancy when the function has
1690- // the "amdgpu-num-vgpr" attribute.
1691- const std::pair<unsigned , unsigned > OccBounds =
1722+ const Function &F = MF.getFunction ();
1723+ const bool UnifiedRF = ST.hasGFX90AInsts ();
1724+
1725+ // Adjust workgroup size induced occupancy bounds with the
1726+ // "amdgpu-waves-per-eu" attribute. This should be offloaded to a subtarget
1727+ // method, but at this point is if unclear how other parts of the codebase
1728+ // interpret this attribute and the default behavior produces unexpected
1729+ // bounds. Here we want to allow users to ask for target occupancies lower
1730+ // than the default lower bound.
1731+ std::pair<unsigned , unsigned > OccBounds =
16921732 ST.getOccupancyWithWorkGroupSizes (MF);
1693- // FIXME: we should be able to just call ST.getMaxNumArchVGPRs() but that
1694- // would use the occupancy bounds as determined by
1695- // MF.getFunction().getWavesPerEU(), which look incorrect in some cases.
1733+ std::pair<unsigned , unsigned > WavesPerEU =
1734+ AMDGPU::getIntegerPairAttribute (F, " amdgpu-waves-per-eu" , {0 , 0 }, true );
1735+ if (WavesPerEU.first <= WavesPerEU.second ) {
1736+ if (WavesPerEU.first && WavesPerEU.first <= OccBounds.second )
1737+ OccBounds.first = WavesPerEU.first ;
1738+ if (WavesPerEU.second )
1739+ OccBounds.second = std::min (OccBounds.second , WavesPerEU.second );
1740+ }
1741+
1742+ // We call the "base max functions" directly because otherwise it uses the
1743+ // subtarget's logic for combining "amdgpu-waves-per-eu" with the function's
1744+ // groupsize induced occupancy bounds, producing unexpected results.
1745+ const unsigned MaxSGPRsNoSpill = ST.getBaseMaxNumSGPRs (
1746+ F, OccBounds, ST.getMaxNumPreloadedSGPRs (), ST.getReservedNumSGPRs (F));
16961747 const unsigned MaxVGPRsNoSpill =
1697- ST.getBaseMaxNumVGPRs (MF.getFunction (),
1698- {ST.getMinNumArchVGPRs (OccBounds.second ),
1699- ST.getMaxNumArchVGPRs (OccBounds.first )},
1700- false );
1701- const unsigned MaxVGPRsIncOcc = ST.getMaxNumArchVGPRs (DAG.MinOccupancy + 1 );
1748+ ST.getBaseMaxNumVGPRs (F, {ST.getMinNumVGPRs (OccBounds.second ),
1749+ ST.getMaxNumVGPRs (OccBounds.first )});
1750+ const unsigned MaxSGPRsMinOcc = ST.getMaxNumSGPRs (DAG.MinOccupancy , false );
1751+ const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs (DAG.MinOccupancy + 1 );
17021752 IncreaseOccupancy = OccBounds.second > DAG.MinOccupancy ;
17031753
1754+ auto ClearOptRegionsIf = [&](bool Cond) -> bool {
1755+ if (Cond) {
1756+ // We won't try to increase occupancy.
1757+ IncreaseOccupancy = false ;
1758+ OptRegions.clear ();
1759+ }
1760+ return Cond;
1761+ };
1762+
17041763 // Collect optimizable regions. If there is spilling in any region we will
1705- // just try to reduce it. Otherwise we will try to increase occupancy by one.
1764+ // just try to reduce ArchVGPR spilling. Otherwise we will try to increase
1765+ // occupancy by one in the whole function.
17061766 for (unsigned I = 0 , E = DAG.Regions .size (); I != E; ++I) {
17071767 GCNRegPressure &RP = DAG.Pressure [I];
1708- unsigned NumVGPRs = RP.getArchVGPRNum ();
17091768 unsigned ExcessRP = 0 ;
1710- if (NumVGPRs > MaxVGPRsNoSpill) {
1711- if (IncreaseOccupancy) {
1712- // We won't try to increase occupancy.
1713- IncreaseOccupancy = false ;
1714- OptRegions.clear ();
1715- }
1716- // Region has VGPR spilling, we will try to reduce spilling as much as
1717- // possible.
1718- ExcessRP = NumVGPRs - MaxVGPRsNoSpill;
1719- REMAT_DEBUG (dbgs () << " Region " << I << " is spilling VGPRs, save "
1720- << ExcessRP << " VGPR(s) to eliminate spilling\n " );
1769+ unsigned NumSGPRs = RP.getSGPRNum ();
1770+
1771+ // Check whether SGPR pressures prevents us from eliminating spilling.
1772+ if (NumSGPRs > MaxSGPRsNoSpill)
1773+ ClearOptRegionsIf (IncreaseOccupancy);
1774+
1775+ bool OccAGPRLimited = false ;
1776+ if (hasExcessVGPRs (RP, MaxVGPRsNoSpill, ExcessRP, OccAGPRLimited)) {
1777+ ClearOptRegionsIf (IncreaseOccupancy);
1778+ REMAT_DEBUG ({
1779+ if (ExcessRP) {
1780+ StringRef RegClass = UnifiedRF ? " VGPRs" : " ArchVGPRs" ;
1781+ dbgs () << " Region " << I << " is spilling " << RegClass << " , save "
1782+ << ExcessRP << " to eliminate " << RegClass << " -spilling\n " ;
1783+ }
1784+ });
17211785 } else if (IncreaseOccupancy) {
1722- if (ST.getOccupancyWithNumSGPRs (RP.getSGPRNum ()) == DAG.MinOccupancy ) {
1723- // Occupancy is SGPR-limited in the region, no point in trying to
1724- // increase it through VGPR usage.
1725- IncreaseOccupancy = false ;
1726- OptRegions.clear ();
1727- } else if (NumVGPRs > MaxVGPRsIncOcc) {
1728- // Occupancy is VGPR-limited.
1729- ExcessRP = NumVGPRs - MaxVGPRsIncOcc;
1730- REMAT_DEBUG (dbgs () << " Region " << I << " has min. occupancy: save "
1731- << ExcessRP << " VGPR(s) to improve occupancy\n " );
1786+ // Check whether SGPR pressure prevents us from increasing occupancy.
1787+ if (ClearOptRegionsIf (NumSGPRs > MaxSGPRsMinOcc))
1788+ continue ;
1789+
1790+ if (hasExcessVGPRs (RP, MaxVGPRsIncOcc, ExcessRP, OccAGPRLimited)) {
1791+ // Check whether AGPR pressure prevents us from increasing occupancy.
1792+ if (ClearOptRegionsIf (OccAGPRLimited))
1793+ continue ;
1794+ // Occupancy could be increased by rematerializing ArchVGPRs.
1795+ REMAT_DEBUG ({
1796+ if (ExcessRP) {
1797+ StringRef RegClass = UnifiedRF ? " VGPRs" : " ArchVGPRs" ;
1798+ dbgs () << " Region " << I << " has min. occupancy: save " << ExcessRP
1799+ << " " << RegClass << " to improve occupancy\n " ;
1800+ }
1801+ });
17321802 }
17331803 }
17341804 if (ExcessRP)
@@ -1738,7 +1808,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
17381808 return false ;
17391809
17401810 // When we are reducing spilling, the target is the minimum achievable
1741- // occupancy implied by workgroup sizes.
1811+ // occupancy implied by workgroup sizes / the "amdgpu-waves-per-eu" attribute .
17421812 TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 : OccBounds.first ;
17431813
17441814 // Accounts for a reduction in RP in an optimizable region. Returns whether we
@@ -2058,9 +2128,11 @@ static MachineBasicBlock *getRegionMBB(MachineFunction &MF,
20582128void PreRARematStage::finalizeGCNSchedStage () {
20592129 // We consider that reducing spilling is always beneficial so we never
20602130 // rollback rematerializations in such cases. It's also possible that
2061- // rescheduling lowers occupancy over the one achived just through remats, in
2062- // which case we do not want to rollback either.
2063- if (!IncreaseOccupancy || AchievedOcc == TargetOcc)
2131+ // rescheduling lowers occupancy over the one achieved just through remats, in
2132+ // which case we do not want to rollback either (the rescheduling was already
2133+ // reverted in PreRARematStage::shouldRevertScheduling in such cases).
2134+ unsigned MaxOcc = std::max (AchievedOcc, DAG.MinOccupancy );
2135+ if (!IncreaseOccupancy || MaxOcc == TargetOcc)
20642136 return ;
20652137
20662138 REMAT_DEBUG (dbgs () << " Rollbacking all rematerializations\n " );
@@ -2077,10 +2149,7 @@ void PreRARematStage::finalizeGCNSchedStage() {
20772149
20782150 // Re-rematerialize MI at the end of its original region. Note that it may
20792151 // not be rematerialized exactly in the same position as originally within
2080- // the region, but it should not matter much. Since we are only
2081- // rematerializing instructions that do not have any virtual reg uses, we
2082- // do not need to call LiveRangeEdit::allUsesAvailableAt() and
2083- // LiveRangeEdit::canRematerializeAt().
2152+ // the region, but it should not matter much.
20842153 TII->reMaterialize (*MBB, InsertPos, Reg, SubReg, RematMI, *DAG.TRI );
20852154 MachineInstr *NewMI = &*std::prev (InsertPos);
20862155 NewMI->getOperand (0 ).setSubReg (SubReg);
0 commit comments