Skip to content

Commit 0f781b6

Browse files
committed
[AMDGPU][SIInsertWaitCnts] Refactor xcnt optimization
Refactor the XCnt optimization checks into standalone predicates so that they can also be evaluated when applying a pre-existing waitcnt. This removes unnecessary xcnt waits when taking a loop backedge.
1 parent bb14b83 commit 0f781b6

File tree

5 files changed

+51
-71
lines changed

5 files changed

+51
-71
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -646,6 +646,8 @@ class WaitcntBrackets {
646646

647647
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
648648
void applyWaitcnt(InstCounterType T, unsigned Count);
649+
bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
650+
bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
649651
void applyXcnt(const AMDGPU::Waitcnt &Wait);
650652
void updateByEvent(WaitEventType E, MachineInstr &MI);
651653

@@ -1287,40 +1289,35 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
12871289
}
12881290
}
12891291

1290-
void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
1291-
// On entry to a block with multiple predecessors, there may
1292-
// be pending SMEM and VMEM events active at the same time.
1293-
// In such cases, only clear one active event at a time.
1294-
auto applyPendingXcntGroup = [this](unsigned E) {
1295-
unsigned LowerBound = getScoreLB(X_CNT);
1296-
applyWaitcnt(X_CNT, 0);
1297-
PendingEvents |= (1 << E);
1298-
setScoreLB(X_CNT, LowerBound);
1299-
};
1300-
1292+
bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) {
13011293
// Wait on XCNT is redundant if we are already waiting for a load to complete.
13021294
// SMEM can return out of order, so only omit XCNT wait if we are waiting till
13031295
// zero.
1304-
if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
1305-
if (hasPendingEvent(VMEM_GROUP))
1306-
applyPendingXcntGroup(VMEM_GROUP);
1307-
else
1308-
applyWaitcnt(X_CNT, 0);
1309-
return;
1310-
}
1296+
return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);
1297+
}
13111298

1299+
bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
13121300
// If we have pending store we cannot optimize XCnt because we do not wait for
13131301
// stores. VMEM loads return in order, so if we only have loads XCnt is
13141302
// decremented to the same number as LOADCnt.
1315-
if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
1316-
!hasPendingEvent(STORE_CNT)) {
1317-
if (hasPendingEvent(SMEM_GROUP))
1318-
applyPendingXcntGroup(SMEM_GROUP);
1303+
return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
1304+
!hasPendingEvent(STORE_CNT) && !hasPendingEvent(SMEM_GROUP);
1305+
}
1306+
1307+
void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
1308+
if (hasRedundantXCntWithKmCnt(Wait)) {
1309+
if (hasPendingEvent(VMEM_GROUP))
1310+
// Only clear the SMEM_GROUP event, but VMEM_GROUP could still require handling.
1311+
PendingEvents &= ~(1 << SMEM_GROUP);
13191312
else
1320-
applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
1313+
applyWaitcnt(X_CNT, 0);
13211314
return;
13221315
}
1323-
1316+
if (canOptimizeXCntWithLoadCnt(Wait))
1317+
// On entry to a block with multiple predecessors, there may
1318+
// be pending SMEM and VMEM events active at the same time.
1319+
// In such cases, only clear one active event at a time.
1320+
return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
13241321
applyWaitcnt(X_CNT, Wait.XCnt);
13251322
}
13261323

@@ -1656,6 +1653,8 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
16561653
}
16571654
}
16581655

1656+
// Save the pre-combine waitcnt in order to make xcnt checks.
1657+
AMDGPU::Waitcnt PreCombine = Wait;
16591658
if (CombinedLoadDsCntInstr) {
16601659
// Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
16611660
// to be waited for. Otherwise, let the instruction be deleted so
@@ -1746,6 +1745,12 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
17461745
}
17471746

17481747
for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1748+
if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) ||
1749+
(CT == LOAD_CNT &&
1750+
ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine)))
1751+
// Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT
1752+
// due to taking the backedge of a block.
1753+
ScoreBrackets.applyXcnt(PreCombine);
17491754
if (!WaitInstrs[CT])
17501755
continue;
17511756

llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2107,7 +2107,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
21072107
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0
21082108
; GFX1250-SDAG-NEXT: .LBB116_1: ; %bb3
21092109
; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
2110-
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
2110+
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
21112111
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
21122112
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
21132113
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
@@ -2126,7 +2126,6 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
21262126
; GFX1250-GISEL-NEXT: .LBB116_1: ; %bb3
21272127
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
21282128
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
2129-
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
21302129
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
21312130
; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
21322131
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
@@ -2162,7 +2161,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
21622161
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0
21632162
; GFX1250-SDAG-NEXT: .LBB117_1: ; %bb3
21642163
; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
2165-
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
2164+
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
21662165
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
21672166
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
21682167
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
@@ -2183,7 +2182,6 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
21832182
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
21842183
; GFX1250-GISEL-NEXT: .LBB117_1: ; %bb3
21852184
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
2186-
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
21872185
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
21882186
; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
21892187
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo

llvm/test/CodeGen/AMDGPU/fmin3.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1233,7 +1233,6 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
12331233
; GFX1250-NEXT: s_wait_loadcnt 0x0
12341234
; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
12351235
; GFX1250-NEXT: s_wait_loadcnt 0x0
1236-
; GFX1250-NEXT: s_wait_xcnt 0x1
12371236
; GFX1250-NEXT: s_mov_b32 s4, s14
12381237
; GFX1250-NEXT: s_mov_b32 s5, s15
12391238
; GFX1250-NEXT: s_mov_b32 s0, s8
@@ -1443,7 +1442,6 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
14431442
; GFX1250-NEXT: s_wait_loadcnt 0x0
14441443
; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
14451444
; GFX1250-NEXT: s_wait_loadcnt 0x0
1446-
; GFX1250-NEXT: s_wait_xcnt 0x1
14471445
; GFX1250-NEXT: s_mov_b32 s4, s14
14481446
; GFX1250-NEXT: s_mov_b32 s5, s15
14491447
; GFX1250-NEXT: s_mov_b32 s0, s8

0 commit comments

Comments
 (0)