@@ -633,8 +633,11 @@ class WaitcntBrackets {
633633 const MachineOperand &Op) const ;
634634
635635 bool counterOutOfOrder (InstCounterType T) const ;
636- void simplifyWaitcnt (AMDGPU::Waitcnt &Wait) const ;
636+ void simplifyWaitcnt (AMDGPU::Waitcnt &Wait);
637637 void simplifyWaitcnt (InstCounterType T, unsigned &Count) const ;
638+ bool hasRedundantXCntWithKmCnt (const AMDGPU::Waitcnt &Wait);
639+ bool canOptimizeXCntWithLoadCnt (const AMDGPU::Waitcnt &Wait);
640+ void simplifyXcnt (AMDGPU::Waitcnt &Wait);
638641
639642 void determineWait (InstCounterType T, RegInterval Interval,
640643 AMDGPU::Waitcnt &Wait) const ;
@@ -646,9 +649,6 @@ class WaitcntBrackets {
646649
647650 void applyWaitcnt (const AMDGPU::Waitcnt &Wait);
648651 void applyWaitcnt (InstCounterType T, unsigned Count);
649- bool hasRedundantXCntWithKmCnt (const AMDGPU::Waitcnt &Wait);
650- bool canOptimizeXCntWithLoadCnt (const AMDGPU::Waitcnt &Wait);
651- void applyXcnt (const AMDGPU::Waitcnt &Wait);
652652 void updateByEvent (WaitEventType E, MachineInstr &MI);
653653
654654 unsigned hasPendingEvent () const { return PendingEvents; }
@@ -1194,15 +1194,15 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
11941194
11951195// / Simplify the waitcnt, in the sense of removing redundant counts, and return
11961196// / whether a waitcnt instruction is needed at all.
1197- void WaitcntBrackets::simplifyWaitcnt (AMDGPU::Waitcnt &Wait) const {
1197+ void WaitcntBrackets::simplifyWaitcnt (AMDGPU::Waitcnt &Wait) {
11981198 simplifyWaitcnt (LOAD_CNT, Wait.LoadCnt );
11991199 simplifyWaitcnt (EXP_CNT, Wait.ExpCnt );
12001200 simplifyWaitcnt (DS_CNT, Wait.DsCnt );
12011201 simplifyWaitcnt (STORE_CNT, Wait.StoreCnt );
12021202 simplifyWaitcnt (SAMPLE_CNT, Wait.SampleCnt );
12031203 simplifyWaitcnt (BVH_CNT, Wait.BvhCnt );
12041204 simplifyWaitcnt (KM_CNT, Wait.KmCnt );
1205- simplifyWaitcnt (X_CNT, Wait. XCnt );
1205+ simplifyXcnt ( Wait);
12061206}
12071207
12081208void WaitcntBrackets::simplifyWaitcnt (InstCounterType T,
@@ -1272,7 +1272,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
12721272 applyWaitcnt (SAMPLE_CNT, Wait.SampleCnt );
12731273 applyWaitcnt (BVH_CNT, Wait.BvhCnt );
12741274 applyWaitcnt (KM_CNT, Wait.KmCnt );
1275- applyXcnt ( Wait);
1275+ applyWaitcnt (X_CNT, Wait. XCnt );
12761276}
12771277
12781278void WaitcntBrackets::applyWaitcnt (InstCounterType T, unsigned Count) {
@@ -1304,7 +1304,11 @@ bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
13041304 !hasPendingEvent (STORE_CNT) && !hasPendingEvent (SMEM_GROUP);
13051305}
13061306
1307- void WaitcntBrackets::applyXcnt (const AMDGPU::Waitcnt &Wait) {
1307+ void WaitcntBrackets::simplifyXcnt (AMDGPU::Waitcnt &Wait) {
1308+ // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1309+ // optimizations. On entry to a block with multiple predecessors, there may
1310+ // be pending SMEM and VMEM events active at the same time.
1311+ // In such cases, only clear one active event at a time.
13081312 if (hasRedundantXCntWithKmCnt (Wait)) {
13091313 if (hasPendingEvent (VMEM_GROUP)) {
13101314 // Only clear the SMEM_GROUP event, but VMEM_GROUP could still require
@@ -1313,15 +1317,10 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
13131317 } else {
13141318 applyWaitcnt (X_CNT, 0 );
13151319 }
1316- return ;
1317- }
1318- if (canOptimizeXCntWithLoadCnt (Wait)) {
1319- // On entry to a block with multiple predescessors, there may
1320- // be pending SMEM and VMEM events active at the same time.
1321- // In such cases, only clear one active event at a time.
1322- return applyWaitcnt (X_CNT, std::min (Wait.XCnt , Wait.LoadCnt ));
1320+ } else if (canOptimizeXCntWithLoadCnt (Wait)) {
1321+ applyWaitcnt (X_CNT, std::min (Wait.XCnt , Wait.LoadCnt ));
13231322 }
1324- applyWaitcnt (X_CNT, Wait.XCnt );
1323+ simplifyWaitcnt (X_CNT, Wait.XCnt );
13251324}
13261325
13271326// Where there are multiple types of event in the bracket of a counter,
@@ -1753,7 +1752,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
17531752 ScoreBrackets.canOptimizeXCntWithLoadCnt (PreCombine))) {
17541753 // Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT
17551754 // due to taking the backedge of a block.
1756- ScoreBrackets.applyXcnt (PreCombine);
1755+ ScoreBrackets.simplifyXcnt (PreCombine);
17571756 }
17581757 if (!WaitInstrs[CT])
17591758 continue ;
@@ -2169,19 +2168,11 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
21692168 << " Update Instr: " << *It);
21702169 }
21712170
2172- // XCnt may be already consumed by a load wait.
2173- if (Wait.XCnt != ~0u ) {
2174- if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent (SMEM_GROUP))
2175- Wait.XCnt = ~0u ;
2176-
2177- if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent (VMEM_GROUP))
2178- Wait.XCnt = ~0u ;
2179-
2180- // Since the translation for VMEM addresses occur in-order, we can skip the
2181- // XCnt if the current instruction is of VMEM type and has a memory
2182- // dependency with another VMEM instruction in flight.
2183- if (isVmemAccess (*It))
2184- Wait.XCnt = ~0u ;
2171+ // Since the translation for VMEM addresses occurs in-order, we can skip the
2172+ // XCnt if the current instruction is of VMEM type and has a memory
2173+ // dependency with another VMEM instruction in flight.
2174+ if (Wait.XCnt != ~0u && isVmemAccess (*It)) {
2175+ Wait.XCnt = ~0u ;
21852176 }
21862177
21872178 if (WCG->createNewWaitcnt (Block, It, Wait))
0 commit comments