Skip to content

Commit fab547a

Browse files
committed
Revert to use of default waves/EU calculation & Fix progress detection
- Removes the custom logic for determining waves/EU bounds and uses the default subtarget method instead. - Saving ArchVGPRs without crossing an ArchVGPR granule still counts as progress. - Some test churn.
1 parent 799cfed commit fab547a

File tree

4 files changed

+305
-412
lines changed

4 files changed

+305
-412
lines changed

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 34 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1711,6 +1711,11 @@ struct ExcessRP {
17111711
unsigned ArchVGPRsToAlignment = 0;
17121712
/// Whether the region uses AGPRs.
17131713
bool HasAGPRs = false;
1714+
/// Whether the subtarget has a unified RF.
1715+
bool UnifiedRF;
1716+
1717+
/// ArchVGPR allocation granule for unified RFs with AGPR usage.
1718+
static const unsigned Granule = 4;
17141719

17151720
/// Constructs the excess RP model; determines the excess pressure w.r.t. a
17161721
/// maximum number of allowed VGPRs.
@@ -1749,12 +1754,13 @@ struct ExcessRP {
17491754
} // namespace
17501755

17511756
ExcessRP::ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP,
1752-
unsigned MaxVGPRs) {
1757+
unsigned MaxVGPRs)
1758+
: UnifiedRF(ST.hasGFX90AInsts()) {
17531759
unsigned NumArchVGPRs = RP.getArchVGPRNum();
17541760
unsigned NumAGPRs = RP.getAGPRNum();
17551761
HasAGPRs = NumAGPRs;
17561762

1757-
if (!ST.hasGFX90AInsts()) {
1763+
if (!UnifiedRF) {
17581764
// Non-unified RF. Account for excess pressure for ArchVGPRs and AGPRs
17591765
// independently.
17601766
if (NumArchVGPRs > MaxVGPRs)
@@ -1782,18 +1788,27 @@ ExcessRP::ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP,
17821788
unsigned NumVGPRs = GCNRegPressure::getUnifiedVGPRNum(NumArchVGPRs, NumAGPRs);
17831789
if (NumVGPRs > MaxVGPRs) {
17841790
VGPRs = NumVGPRs - MaxVGPRs;
1785-
ArchVGPRsToAlignment = NumArchVGPRs - alignDown(NumArchVGPRs, 4);
1791+
ArchVGPRsToAlignment = NumArchVGPRs - alignDown(NumArchVGPRs, Granule);
17861792
if (!ArchVGPRsToAlignment)
1787-
ArchVGPRsToAlignment = 4;
1793+
ArchVGPRsToAlignment = Granule;
17881794
}
17891795
}
17901796

17911797
bool ExcessRP::saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill) {
17921798
bool Progress = saveRegs(ArchVGPRs, NumRegs);
1799+
if (!NumRegs)
1800+
return Progress;
17931801

1794-
if (HasAGPRs) {
1795-
// ArchVGPRs can only be allocated as a multiple of a granule.
1796-
const unsigned Granule = 4;
1802+
if (!UnifiedRF) {
1803+
if (UseArchVGPRForAGPRSpill)
1804+
Progress |= saveRegs(AGPRs, NumRegs);
1805+
} else if (HasAGPRs && (VGPRs || (UseArchVGPRForAGPRSpill && AGPRs))) {
1806+
// There is progress as long as there are VGPRs left to save, even if the
1807+
// save induced by this particular call does not cross an ArchVGPR alignment
1808+
// barrier.
1809+
Progress = true;
1810+
1811+
// ArchVGPRs can only be allocated as a multiple of a granule in unified RF.
17971812
unsigned NumSavedRegs = 0;
17981813

17991814
// Count the number of whole ArchVGPR allocation granules we can save.
@@ -1812,9 +1827,9 @@ bool ExcessRP::saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill) {
18121827

18131828
// Prioritize saving generic VGPRs, then AGPRs if we allow AGPR-to-ArchVGPR
18141829
// spilling and have some free ArchVGPR slots.
1815-
Progress |= saveRegs(VGPRs, NumSavedRegs);
1830+
saveRegs(VGPRs, NumSavedRegs);
18161831
if (UseArchVGPRForAGPRSpill)
1817-
Progress |= saveRegs(AGPRs, NumSavedRegs);
1832+
saveRegs(AGPRs, NumSavedRegs);
18181833
} else {
18191834
// No AGPR usage in the region i.e., no allocation granule to worry about.
18201835
Progress |= saveRegs(VGPRs, NumRegs);
@@ -1838,35 +1853,16 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
18381853
DenseMap<unsigned, ExcessRP> OptRegions;
18391854
const Function &F = MF.getFunction();
18401855

1841-
// Adjust workgroup size induced occupancy bounds with the
1842-
// "amdgpu-waves-per-eu" attribute. This should be offloaded to a subtarget
1843-
// method, but at this point is if unclear how other parts of the codebase
1844-
// interpret this attribute and the default behavior produces unexpected
1845-
// bounds. Here we want to allow users to ask for target occupancies lower
1846-
// than the default lower bound.
1847-
std::pair<unsigned, unsigned> OccBounds =
1848-
ST.getOccupancyWithWorkGroupSizes(MF);
1849-
std::pair<unsigned, unsigned> WavesPerEU =
1850-
AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
1851-
if (WavesPerEU.first <= WavesPerEU.second) {
1852-
if (WavesPerEU.first && WavesPerEU.first <= OccBounds.second)
1853-
OccBounds.first = WavesPerEU.first;
1854-
if (WavesPerEU.second)
1855-
OccBounds.second = std::min(OccBounds.second, WavesPerEU.second);
1856-
}
1857-
1858-
// We call the "base max functions" directly because otherwise it uses the
1859-
// subtarget's logic for combining "amdgpu-waves-per-eu" with the function's
1860-
// groupsize induced occupancy bounds, producing unexpected results.
1856+
std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
18611857
const unsigned MaxSGPRsNoSpill = ST.getBaseMaxNumSGPRs(
1862-
F, OccBounds, ST.getMaxNumPreloadedSGPRs(), ST.getReservedNumSGPRs(F));
1858+
F, WavesPerEU, ST.getMaxNumPreloadedSGPRs(), ST.getReservedNumSGPRs(F));
18631859
const unsigned MaxVGPRsNoSpill =
1864-
ST.getBaseMaxNumVGPRs(F, {ST.getMinNumVGPRs(OccBounds.second),
1865-
ST.getMaxNumVGPRs(OccBounds.first)});
1860+
ST.getBaseMaxNumVGPRs(F, {ST.getMinNumVGPRs(WavesPerEU.second),
1861+
ST.getMaxNumVGPRs(WavesPerEU.first)});
18661862
const unsigned MaxSGPRsIncOcc =
18671863
ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
18681864
const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs(DAG.MinOccupancy + 1);
1869-
IncreaseOccupancy = OccBounds.second > DAG.MinOccupancy;
1865+
IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
18701866

18711867
auto ClearOptRegionsIf = [&](bool Cond) -> bool {
18721868
if (Cond) {
@@ -1894,7 +1890,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
18941890
} else if (IncreaseOccupancy) {
18951891
// Check whether SGPR pressure prevents us from increasing occupancy.
18961892
if (ClearOptRegionsIf(NumSGPRs > MaxSGPRsIncOcc)) {
1897-
if (DAG.MinOccupancy >= OccBounds.first)
1893+
if (DAG.MinOccupancy >= WavesPerEU.first)
18981894
return false;
18991895
continue;
19001896
}
@@ -1903,7 +1899,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
19031899
unsigned NumArchVGPRsToRemat = Excess.ArchVGPRs + Excess.VGPRs;
19041900
bool NotEnoughArchVGPRs = NumArchVGPRsToRemat > RP.getArchVGPRNum();
19051901
if (ClearOptRegionsIf(Excess.AGPRs || NotEnoughArchVGPRs)) {
1906-
if (DAG.MinOccupancy >= OccBounds.first)
1902+
if (DAG.MinOccupancy >= WavesPerEU.first)
19071903
return false;
19081904
continue;
19091905
}
@@ -1925,10 +1921,9 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
19251921
REMAT_DEBUG(dbgs() << " " << I << ": " << OptIt->getSecond() << '\n');
19261922
#endif
19271923

1928-
// When we are reducing spilling, the target is the minimum achievable
1929-
// occupancy implied by workgroup sizes / the "amdgpu-waves-per-eu"
1930-
// attribute.
1931-
TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 : OccBounds.first;
1924+
// When we are reducing spilling, the target is the minimum target number of
1925+
// waves/EU determined by the subtarget.
1926+
TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 : WavesPerEU.first;
19321927

19331928
// Accounts for a reduction in RP in an optimizable region. Returns whether we
19341929
// estimate that we have identified enough rematerialization opportunities to

llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll

Lines changed: 40 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -444,77 +444,65 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspa
444444
; GISEL-GFX942-NEXT: s_mov_b32 s2, s7
445445
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
446446
; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3]
447-
; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s16
447+
; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x2000
448+
; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16
448449
; GISEL-GFX942-NEXT: .LBB0_1: ; %load-store-loop
449450
; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
450-
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, s0, v0
451-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[8:11], 0 offen
452-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[8:11], 0 offen offset:16
453-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:32
454-
; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0
455-
; GISEL-GFX942-NEXT: v_add_u32_e32 v0, 0x100, v0
451+
; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1
452+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen
453+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16
454+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32
455+
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
456+
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
457+
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
456458
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
457459
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse
458460
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse
459461
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse
460462
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse
461-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[8:11], 0 offen offset:48
462-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[8:11], 0 offen offset:64
463-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[8:11], 0 offen offset:80
464-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v1, s[8:11], 0 offen offset:96
465-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v1, s[8:11], 0 offen offset:112
466-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v1, s[8:11], 0 offen offset:128
467-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v1, s[8:11], 0 offen offset:144
468-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v1, s[8:11], 0 offen offset:160
469-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v1, s[8:11], 0 offen offset:176
470-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[8:11], 0 offen offset:192
471-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[8:11], 0 offen offset:208
472-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[8:11], 0 offen offset:224
473-
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:240
463+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v62, s[8:11], 0 offen offset:48
464+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v62, s[8:11], 0 offen offset:64
465+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v62, s[8:11], 0 offen offset:80
466+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v62, s[8:11], 0 offen offset:96
467+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v62, s[8:11], 0 offen offset:112
468+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v62, s[8:11], 0 offen offset:128
469+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v62, s[8:11], 0 offen offset:144
470+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v62, s[8:11], 0 offen offset:160
471+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v62, s[8:11], 0 offen offset:176
472+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192
473+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208
474+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224
475+
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:240
474476
; GISEL-GFX942-NEXT: s_nop 0
475-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen
476-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[4:7], 0 offen offset:16
477-
; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, 0x2000
477+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
478+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16
479+
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(2)
480+
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse
478481
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
479482
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse
480483
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse
481484
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse
482-
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
483-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:32
484-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
485-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[4:7], 0 offen offset:48
486-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
487-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[4:7], 0 offen offset:64
488-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
489-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[4:7], 0 offen offset:80
490-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
491-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[4:7], 0 offen offset:96
492-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
493-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[4:7], 0 offen offset:112
494-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
495-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[4:7], 0 offen offset:128
496-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
497-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[4:7], 0 offen offset:144
498-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
499-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[4:7], 0 offen offset:160
500-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
501-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[4:7], 0 offen offset:176
502-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
503-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[4:7], 0 offen offset:192
504-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
505-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[4:7], 0 offen offset:208
506-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
507-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[4:7], 0 offen offset:224
508-
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15)
509-
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse
510485
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse
511486
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse
512487
; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse
488+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:32
489+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48
490+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64
491+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80
492+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96
493+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112
494+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128
495+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144
496+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160
497+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176
498+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192
499+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208
500+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224
513501
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse
514502
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse
515503
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse
516504
; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse
517-
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:240
505+
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240
518506
; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB0_1
519507
; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split
520508
; GISEL-GFX942-NEXT: s_endpgm

0 commit comments

Comments
 (0)