@@ -637,7 +637,7 @@ class WaitcntBrackets {
637637 void simplifyWaitcnt (InstCounterType T, unsigned &Count) const ;
638638 bool hasRedundantXCntWithKmCnt (const AMDGPU::Waitcnt &Wait);
639639 bool canOptimizeXCntWithLoadCnt (const AMDGPU::Waitcnt &Wait);
640- void simplifyXcnt (AMDGPU::Waitcnt &Wait );
640+ void simplifyXcnt (AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait );
641641
642642 void determineWait (InstCounterType T, RegInterval Interval,
643643 AMDGPU::Waitcnt &Wait) const ;
@@ -1202,7 +1202,7 @@ void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) {
12021202 simplifyWaitcnt (SAMPLE_CNT, Wait.SampleCnt );
12031203 simplifyWaitcnt (BVH_CNT, Wait.BvhCnt );
12041204 simplifyWaitcnt (KM_CNT, Wait.KmCnt );
1205- simplifyXcnt (Wait);
1205+ simplifyXcnt (Wait, Wait );
12061206}
12071207
12081208void WaitcntBrackets::simplifyWaitcnt (InstCounterType T,
@@ -1304,23 +1304,23 @@ bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
13041304 !hasPendingEvent (STORE_CNT) && !hasPendingEvent (SMEM_GROUP);
13051305}
13061306
1307- void WaitcntBrackets::simplifyXcnt (AMDGPU::Waitcnt &Wait ) {
1307+ void WaitcntBrackets::simplifyXcnt (AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait ) {
13081308 // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
13091309 // optimizations. On entry to a block with multiple predecessors, there may
13101310 // be pending SMEM and VMEM events active at the same time.
13111311 // In such cases, only clear one active event at a time. CheckWait is the wait that is tested for the kmcnt/loadcnt optimizations; UpdateWait is the wait whose XCnt is simplified at the end.
1312- if (hasRedundantXCntWithKmCnt (Wait )) {
1312+ if (hasRedundantXCntWithKmCnt (CheckWait )) {
13131313 if (hasPendingEvent (VMEM_GROUP)) {
13141314 // Only clear the SMEM_GROUP event, but VMEM_GROUP could still require
13151315 // handling.
13161316 PendingEvents &= ~(1 << SMEM_GROUP);
13171317 } else {
13181318 applyWaitcnt (X_CNT, 0 );
13191319 }
1320- } else if (canOptimizeXCntWithLoadCnt (Wait )) {
1321- applyWaitcnt (X_CNT, std::min (Wait .XCnt , Wait .LoadCnt ));
1320+ } else if (canOptimizeXCntWithLoadCnt (CheckWait )) {
1321+ applyWaitcnt (X_CNT, std::min (CheckWait .XCnt , CheckWait .LoadCnt ));
13221322 }
1323- simplifyWaitcnt (X_CNT, Wait .XCnt );
1323+ simplifyWaitcnt (X_CNT, UpdateWait .XCnt );
13241324}
13251325
13261326// Where there are multiple types of event in the bracket of a counter,
@@ -1752,7 +1752,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
17521752 ScoreBrackets.canOptimizeXCntWithLoadCnt (PreCombine))) {
17531753 // Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT
17541754 // due to taking the backedge of a block.
1755- ScoreBrackets.simplifyXcnt (PreCombine);
1755+ ScoreBrackets.simplifyXcnt (PreCombine, Wait );
17561756 }
17571757 if (!WaitInstrs[CT])
17581758 continue ;
@@ -2100,6 +2100,14 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
21002100 // Verify that the wait is actually needed.
21012101 ScoreBrackets.simplifyWaitcnt (Wait);
21022102
2103+ // Since the translation of VMEM addresses occurs in order, we can apply the
2104+ // XCnt if the current instruction is of VMEM type and has a memory
2105+ // dependency with another VMEM instruction in flight.
2106+ if (Wait.XCnt != ~0u && isVmemAccess (MI)) {
2107+ ScoreBrackets.applyWaitcnt (X_CNT, Wait.XCnt );
2108+ Wait.XCnt = ~0u ;
2109+ }
2110+
21032111 // When forcing emit, we need to skip terminators because that would break the
21042112 // terminators of the MBB if we emit a waitcnt between terminators.
21052113 if (ForceEmitZeroFlag && !MI.isTerminator ())
@@ -2168,13 +2176,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
21682176 << " Update Instr: " << *It);
21692177 }
21702178
2171- // Since the translation for VMEM addresses occur in-order, we can skip the
2172- // XCnt if the current instruction is of VMEM type and has a memory
2173- // dependency with another VMEM instruction in flight.
2174- if (Wait.XCnt != ~0u && isVmemAccess (*It)) {
2175- Wait.XCnt = ~0u ;
2176- }
2177-
21782179 if (WCG->createNewWaitcnt (Block, It, Wait))
21792180 Modified = true ;
21802181
0 commit comments