@@ -646,6 +646,8 @@ class WaitcntBrackets {

   void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
   void applyWaitcnt(InstCounterType T, unsigned Count);
+  bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
+  bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
   void applyXcnt(const AMDGPU::Waitcnt &Wait);
   void updateByEvent(WaitEventType E, MachineInstr &MI);

@@ -1287,20 +1289,26 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
   }
 }

-void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
+bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) {
   // Wait on XCNT is redundant if we are already waiting for a load to complete.
   // SMEM can return out of order, so only omit XCNT wait if we are waiting till
   // zero.
-  if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
-    return applyWaitcnt(X_CNT, 0);
+  return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);
+}

+bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
   // If we have pending store we cannot optimize XCnt because we do not wait for
   // stores. VMEM loads return in order, so if we only have loads XCnt is
   // decremented to the same number as LOADCnt.
-  if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
-      !hasPendingEvent(STORE_CNT))
-    return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+  return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
+         !hasPendingEvent(STORE_CNT);
+}

+void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
+  if (hasRedundantXCntWithKmCnt(Wait))
+    return applyWaitcnt(X_CNT, 0);
+  if (canOptimizeXCntWithLoadCnt(Wait))
+    return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
   applyWaitcnt(X_CNT, Wait.XCnt);
 }

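For reference, the split is behavior-preserving: the two new predicates are the former if-conditions of applyXcnt hoisted out so that applyPreexistingWaitcnt below can query the same conditions directly. A rough standalone sketch of the resulting decision logic follows, using a hypothetical simplified state struct in place of the real WaitcntBrackets/hasPendingEvent API; it is illustrative only and not part of the patch:

#include <algorithm>

// Hypothetical stand-in for the pending-event state tracked by
// WaitcntBrackets; the real pass queries hasPendingEvent() instead.
struct PendingEvents {
  bool SMEMGroup = false; // outstanding SMEM_GROUP event
  bool VMEMGroup = false; // outstanding VMEM_GROUP event
  bool Store = false;     // outstanding store (STORE_CNT event)
};

// Mirrors hasRedundantXCntWithKmCnt: an XCNT wait is covered by a KMCNT
// wait only when KMCNT is waited down to zero, because SMEM can return
// out of order.
static bool redundantWithKmCnt(const PendingEvents &E, unsigned KmCnt) {
  return KmCnt == 0 && E.SMEMGroup;
}

// Mirrors canOptimizeXCntWithLoadCnt: with no pending stores, VMEM loads
// return in order, so XCNT tracks LOADCNT and can be clamped to it.
static bool optimizableWithLoadCnt(const PendingEvents &E, unsigned LoadCnt) {
  return LoadCnt != ~0u && E.VMEMGroup && !E.Store;
}

// Mirrors the refactored applyXcnt: pick the XCNT value that actually
// gets applied to the counter brackets.
static unsigned resolveXcnt(const PendingEvents &E, unsigned KmCnt,
                            unsigned LoadCnt, unsigned XCnt) {
  if (redundantWithKmCnt(E, KmCnt))
    return 0;
  if (optimizableWithLoadCnt(E, LoadCnt))
    return std::min(XCnt, LoadCnt);
  return XCnt;
}
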
@@ -1636,6 +1644,8 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
     }
   }

+  // Save the pre-combine waitcnt in order to make the XCnt checks.
+  AMDGPU::Waitcnt PreCombine = Wait;
   if (CombinedLoadDsCntInstr) {
     // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
     // to be waited for. Otherwise, let the instruction be deleted so
@@ -1726,6 +1736,12 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
   }

   for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+    // XCnt may need to be updated depending on a pre-existing KM_CNT or
+    // LOAD_CNT wait due to taking the backedge of a block.
+    if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) ||
+        (CT == LOAD_CNT &&
+         ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine)))
+      ScoreBrackets.applyXcnt(PreCombine);
     if (!WaitInstrs[CT])
       continue;

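Per the comments in the patch, the saved PreCombine counts are what the XCnt re-check keys off: when a pre-existing KMCNT or LOADCNT wait is seen (for instance one carried in via a loop backedge), XCNT can be tightened from those pre-combine values. Below is a condensed, hypothetical model of that guard, reusing PendingEvents, redundantWithKmCnt, optimizableWithLoadCnt, and resolveXcnt from the sketch above; it does not reflect the pass's real control flow or types:

// Hypothetical model of the new per-counter hook: when the loop visits the
// KM or LOAD counter and the pre-combine counts show XCNT is covered, the
// XCNT requirement is re-derived from those pre-combine values; otherwise
// it is left unchanged.
enum CounterKind { KmCounter, LoadCounter, OtherCounter };

static unsigned maybeTightenXcnt(CounterKind CT, const PendingEvents &E,
                                 unsigned PreKmCnt, unsigned PreLoadCnt,
                                 unsigned PreXCnt, unsigned CurrentXCnt) {
  if ((CT == KmCounter && redundantWithKmCnt(E, PreKmCnt)) ||
      (CT == LoadCounter && optimizableWithLoadCnt(E, PreLoadCnt)))
    return resolveXcnt(E, PreKmCnt, PreLoadCnt, PreXCnt);
  return CurrentXCnt;
}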