@@ -646,6 +646,8 @@ class WaitcntBrackets {
 
   void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
   void applyWaitcnt(InstCounterType T, unsigned Count);
+  bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
+  bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
   void applyXcnt(const AMDGPU::Waitcnt &Wait);
   void updateByEvent(WaitEventType E, MachineInstr &MI);
 
@@ -1287,40 +1289,35 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
   }
 }
 
-void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
-  // On entry to a block with multiple predecessors, there may
-  // be pending SMEM and VMEM events active at the same time.
-  // In such cases, only clear one active event at a time.
-  auto applyPendingXcntGroup = [this](unsigned E) {
-    unsigned LowerBound = getScoreLB(X_CNT);
-    applyWaitcnt(X_CNT, 0);
-    PendingEvents |= (1 << E);
-    setScoreLB(X_CNT, LowerBound);
-  };
-
+bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) {
   // Wait on XCNT is redundant if we are already waiting for a load to complete.
   // SMEM can return out of order, so only omit XCNT wait if we are waiting till
   // zero.
-  if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
-    if (hasPendingEvent(VMEM_GROUP))
-      applyPendingXcntGroup(VMEM_GROUP);
-    else
-      applyWaitcnt(X_CNT, 0);
-    return;
-  }
+  return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);
+}
 
+bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
   // If we have a pending store we cannot optimize XCnt because we do not wait
   // for stores. VMEM loads return in order, so if we only have loads XCnt is
   // decremented to the same number as LOADCnt.
-  if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
-      !hasPendingEvent(STORE_CNT)) {
-    if (hasPendingEvent(SMEM_GROUP))
-      applyPendingXcntGroup(SMEM_GROUP);
+  return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
+         !hasPendingEvent(STORE_CNT) && !hasPendingEvent(SMEM_GROUP);
+}
+
+void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
+  if (hasRedundantXCntWithKmCnt(Wait)) {
+    if (hasPendingEvent(VMEM_GROUP))
+      // Only clear the SMEM_GROUP event; VMEM_GROUP may still require handling.
+      PendingEvents &= ~(1 << SMEM_GROUP);
     else
-      applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+      applyWaitcnt(X_CNT, 0);
     return;
   }
-
+  if (canOptimizeXCntWithLoadCnt(Wait))
+    // On entry to a block with multiple predecessors, there may be pending
+    // SMEM and VMEM events active at the same time. In such cases, only
+    // clear one active event at a time.
+    return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
   applyWaitcnt(X_CNT, Wait.XCnt);
 }
 
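
The refactor above splits applyXcnt's two early-outs into named predicates so the same conditions can be re-checked from applyPreexistingWaitcnt in the hunks that follow. Below is a minimal standalone sketch of that decision logic, not the LLVM implementation: the predicate bodies, the ~0u "no wait requested" sentinel, and the PendingEvents bitmask mirror the diff, while BracketsSketch, its printf bookkeeping, and the main() scenario are invented for illustration.

#include <algorithm>
#include <cstdio>

enum WaitEventType { SMEM_GROUP, VMEM_GROUP, STORE_CNT };

struct Waitcnt {
  unsigned KmCnt = ~0u; // ~0u encodes "no wait requested"
  unsigned LoadCnt = ~0u;
  unsigned XCnt = ~0u;
};

struct BracketsSketch {
  unsigned PendingEvents = 0;

  bool hasPendingEvent(WaitEventType E) const {
    return PendingEvents & (1u << E);
  }

  // An XCnt wait is redundant when kmcnt(0) already covers the pending SMEM
  // group; SMEM may return out of order, hence the strict == 0 test.
  bool hasRedundantXCntWithKmCnt(const Waitcnt &W) const {
    return W.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);
  }

  // XCnt can be relaxed to LoadCnt: VMEM loads return in order and stores
  // are never waited on, so the two counters drain in lockstep.
  bool canOptimizeXCntWithLoadCnt(const Waitcnt &W) const {
    return W.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
           !hasPendingEvent(STORE_CNT) && !hasPendingEvent(SMEM_GROUP);
  }

  void applyXcnt(const Waitcnt &W) {
    if (hasRedundantXCntWithKmCnt(W)) {
      if (hasPendingEvent(VMEM_GROUP))
        PendingEvents &= ~(1u << SMEM_GROUP); // VMEM stays pending
      else
        PendingEvents = 0; // stands in for applyWaitcnt(X_CNT, 0)
      return;
    }
    if (canOptimizeXCntWithLoadCnt(W))
      std::printf("xcnt relaxed to %u\n", std::min(W.XCnt, W.LoadCnt));
  }
};

int main() {
  // A block merged from two predecessors: SMEM and VMEM groups both pending.
  BracketsSketch B;
  B.PendingEvents = (1u << SMEM_GROUP) | (1u << VMEM_GROUP);

  Waitcnt W;
  W.KmCnt = 0; // an existing s_wait_kmcnt 0 makes the XCnt wait redundant
  B.applyXcnt(W);

  // Only SMEM_GROUP was cleared; VMEM_GROUP still needs handling later.
  std::printf("VMEM pending: %d\n", (int)B.hasPendingEvent(VMEM_GROUP));
}

Clearing only the SMEM_GROUP bit replaces the old applyPendingXcntGroup lambda: instead of fully applying the wait and re-arming the surviving group, the new code keeps the bracket state and drops just the event that the kmcnt wait covered.
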
@@ -1656,6 +1653,8 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
     }
   }
 
+  // Save the pre-combine waitcnt in order to make the XCnt checks below.
+  AMDGPU::Waitcnt PreCombine = Wait;
   if (CombinedLoadDsCntInstr) {
     // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
     // to be waited for. Otherwise, let the instruction be deleted so
@@ -1746,6 +1745,12 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
   }
 
   for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+    if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) ||
+        (CT == LOAD_CNT &&
+         ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine)))
+      // XCnt may need to be updated depending on a pre-existing KM_CNT or
+      // LOAD_CNT wait due to taking the backedge of a block.
+      ScoreBrackets.applyXcnt(PreCombine);
     if (!WaitInstrs[CT])
       continue;
 
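
The last two hunks cooperate: a snapshot of Wait is taken before the combined-waitcnt rewriting, and the per-counter loop re-applies XCnt from that snapshot when a pre-existing kmcnt(0) or loadcnt wait (e.g. one carried around a loop backedge) already covers it. Here is a rough standalone sketch of that snapshot-before-combine pattern with WaitcntBrackets stubbed out; the stub predicate bodies and the mutation step are assumptions for illustration only.

#include <cstdio>

enum InstCounterType { LOAD_CNT, DS_CNT, KM_CNT, X_CNT, NUM_EXTENDED_INST_CNTS };

struct Waitcnt {
  unsigned LoadCnt = ~0u, DsCnt = ~0u, KmCnt = ~0u, XCnt = ~0u;
};

// Stubs standing in for WaitcntBrackets; only the call shape matters here.
struct BracketsStub {
  bool hasRedundantXCntWithKmCnt(const Waitcnt &W) { return W.KmCnt == 0; }
  bool canOptimizeXCntWithLoadCnt(const Waitcnt &W) { return W.LoadCnt != ~0u; }
  void applyXcnt(const Waitcnt &W) {
    std::printf("applyXcnt sees KmCnt=%u\n", W.KmCnt);
  }
};

int main() {
  Waitcnt Wait;
  Wait.KmCnt = 0; // pre-existing s_wait_kmcnt 0, e.g. on a loop backedge

  BracketsStub ScoreBrackets;
  Waitcnt PreCombine = Wait; // snapshot before any combining/clearing

  Wait.KmCnt = ~0u; // the combine step may fold this wait away

  for (unsigned CT = 0; CT != NUM_EXTENDED_INST_CNTS; ++CT) {
    // Decide from the snapshot, not from the (possibly consumed) Wait.
    if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) ||
        (CT == LOAD_CNT &&
         ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine)))
      ScoreBrackets.applyXcnt(PreCombine);
  }
}
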