@@ -338,8 +338,8 @@ class WaitcntBrackets {
338338 const MachineOperand &Op) const ;
339339
340340 bool counterOutOfOrder (InstCounterType T) const ;
341- void simplifyWaitcnt (AMDGPU::Waitcnt &Wait) const ;
342- void simplifyWaitcnt (InstCounterType T, unsigned &Count) const ;
341+ void simplifyWaitcnt (AMDGPU::Waitcnt &Wait, bool OptNone ) const ;
342+ void simplifyWaitcnt (InstCounterType T, unsigned &Count, bool OptNone ) const ;
343343
344344 void determineWait (InstCounterType T, RegInterval Interval,
345345 AMDGPU::Waitcnt &Wait) const ;
@@ -1164,22 +1164,33 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
11641164
11651165// / Simplify the waitcnt, in the sense of removing redundant counts. Nothing is
11661166// / returned: a count that is found to be redundant is reset in place to ~0u.
1167- void WaitcntBrackets::simplifyWaitcnt (AMDGPU::Waitcnt &Wait) const {
1168- simplifyWaitcnt (LOAD_CNT, Wait.LoadCnt );
1169- simplifyWaitcnt (EXP_CNT, Wait.ExpCnt );
1170- simplifyWaitcnt (DS_CNT, Wait.DsCnt );
1171- simplifyWaitcnt (STORE_CNT, Wait.StoreCnt );
1172- simplifyWaitcnt (SAMPLE_CNT, Wait.SampleCnt );
1173- simplifyWaitcnt (BVH_CNT, Wait.BvhCnt );
1174- simplifyWaitcnt (KM_CNT, Wait.KmCnt );
1175- simplifyWaitcnt (X_CNT, Wait.XCnt );
1167+ void WaitcntBrackets::simplifyWaitcnt (AMDGPU::Waitcnt &Wait,
1168+ bool OptNone) const {
1169+ simplifyWaitcnt (LOAD_CNT, Wait.LoadCnt , OptNone);
1170+ simplifyWaitcnt (EXP_CNT, Wait.ExpCnt , OptNone);
1171+ simplifyWaitcnt (DS_CNT, Wait.DsCnt , OptNone);
1172+ simplifyWaitcnt (STORE_CNT, Wait.StoreCnt , OptNone);
1173+ simplifyWaitcnt (SAMPLE_CNT, Wait.SampleCnt , OptNone);
1174+ simplifyWaitcnt (BVH_CNT, Wait.BvhCnt , OptNone);
1175+ simplifyWaitcnt (KM_CNT, Wait.KmCnt , OptNone);
1176+ simplifyWaitcnt (X_CNT, Wait.XCnt , OptNone);
11761177}
11771178
1178- void WaitcntBrackets::simplifyWaitcnt (InstCounterType T,
1179- unsigned &Count ) const {
1179+ void WaitcntBrackets::simplifyWaitcnt (InstCounterType T, unsigned &Count,
1180+ bool OptNone ) const {
11801181 // The number of outstanding events for this type, T, can be calculated
11811182 // as (UB - LB). If the current Count is greater than or equal to the number
11821183 // of outstanding events, then the wait for this counter is redundant.
1184+ //
1185+ // When OptNone is set, only counts at or above getWaitCountMax(T) are
1186+ // considered for removal; smaller counts are left unchanged. This removes
1187+ // the max-value waitcnts that the memory legalizer inserts by default, but
1188+ // does not optimize other waitcnts inserted by the memory legalizer or by a
1189+ // previous pass of the inserter. Corner case: a max-value waitcnt that was
1190+ // chosen deliberately, rather than as a default, may also be removed. This
1191+ // only marginally affects the usefulness of OptNone.
1192+ if (Count < getWaitCountMax (T) && OptNone)
1193+ return ;
11831194 if (Count >= getScoreRange (T))
11841195 Count = ~0u ;
11851196}
@@ -1363,19 +1374,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
13631374 }
13641375
13651376 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode (II.getOpcode ());
1366- bool TrySimplify = Opcode != II.getOpcode () && !OptNone ;
1377+ bool OpcodeIsSoft = Opcode != II.getOpcode ();
13671378
13681379 // Update required wait count. If this is a soft waitcnt (= it was added
13691380 // by an earlier pass), it may be entirely removed.
13701381 if (Opcode == AMDGPU::S_WAITCNT) {
13711382 unsigned IEnc = II.getOperand (0 ).getImm ();
13721383 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt (IV, IEnc);
1373- if (TrySimplify )
1374- ScoreBrackets.simplifyWaitcnt (OldWait);
1384+ if (OpcodeIsSoft )
1385+ ScoreBrackets.simplifyWaitcnt (OldWait, OptNone );
13751386 Wait = Wait.combined (OldWait);
13761387
13771388 // Merge consecutive waitcnt of the same type by erasing multiples.
1378- if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt () && TrySimplify)) {
1389+ if (WaitcntInstr ||
1390+ (!Wait.hasWaitExceptStoreCnt () && OpcodeIsSoft && !OptNone)) {
13791391 II.eraseFromParent ();
13801392 Modified = true ;
13811393 } else
@@ -1386,11 +1398,13 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
13861398
13871399 unsigned OldVSCnt =
13881400 TII->getNamedOperand (II, AMDGPU::OpName::simm16)->getImm ();
1389- if (TrySimplify)
1390- ScoreBrackets.simplifyWaitcnt (InstCounterType::STORE_CNT, OldVSCnt);
1401+ if (OpcodeIsSoft)
1402+ ScoreBrackets.simplifyWaitcnt (InstCounterType::STORE_CNT, OldVSCnt,
1403+ OptNone);
13911404 Wait.StoreCnt = std::min (Wait.StoreCnt , OldVSCnt);
13921405
1393- if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt () && TrySimplify)) {
1406+ if (WaitcntVsCntInstr ||
1407+ (!Wait.hasWaitStoreCnt () && OpcodeIsSoft && !OptNone)) {
13941408 II.eraseFromParent ();
13951409 Modified = true ;
13961410 } else
@@ -1528,7 +1542,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
15281542 // by an earlier pass), it may be entirely removed.
15291543
15301544 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode (II.getOpcode ());
1531- bool TrySimplify = Opcode != II.getOpcode () && !OptNone ;
1545+ bool OpcodeIsSoft = Opcode != II.getOpcode ();
15321546
15331547 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
15341548 // attempt to do more than that either.
@@ -1539,25 +1553,25 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
15391553 unsigned OldEnc =
15401554 TII->getNamedOperand (II, AMDGPU::OpName::simm16)->getImm ();
15411555 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt (IV, OldEnc);
1542- if (TrySimplify )
1543- ScoreBrackets.simplifyWaitcnt (OldWait);
1556+ if (OpcodeIsSoft )
1557+ ScoreBrackets.simplifyWaitcnt (OldWait, OptNone );
15441558 Wait = Wait.combined (OldWait);
15451559 UpdatableInstr = &CombinedLoadDsCntInstr;
15461560 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
15471561 unsigned OldEnc =
15481562 TII->getNamedOperand (II, AMDGPU::OpName::simm16)->getImm ();
15491563 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt (IV, OldEnc);
1550- if (TrySimplify )
1551- ScoreBrackets.simplifyWaitcnt (OldWait);
1564+ if (OpcodeIsSoft )
1565+ ScoreBrackets.simplifyWaitcnt (OldWait, OptNone );
15521566 Wait = Wait.combined (OldWait);
15531567 UpdatableInstr = &CombinedStoreDsCntInstr;
15541568 } else {
15551569 std::optional<InstCounterType> CT = counterTypeForInstr (Opcode);
15561570 assert (CT.has_value ());
15571571 unsigned OldCnt =
15581572 TII->getNamedOperand (II, AMDGPU::OpName::simm16)->getImm ();
1559- if (TrySimplify )
1560- ScoreBrackets.simplifyWaitcnt (CT.value (), OldCnt);
1573+ if (OpcodeIsSoft )
1574+ ScoreBrackets.simplifyWaitcnt (CT.value (), OldCnt, OptNone );
15611575 addWait (Wait, CT.value (), OldCnt);
15621576 UpdatableInstr = &WaitInstrs[CT.value ()];
15631577 }
@@ -2009,7 +2023,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
20092023 }
20102024
20112025 // Verify that the wait is actually needed.
2012- ScoreBrackets.simplifyWaitcnt (Wait);
2026+ ScoreBrackets.simplifyWaitcnt (Wait, /* OptNone = */ false );
20132027
20142028 // When forcing emit, we need to skip terminators because that would break the
20152029 // terminators of the MBB if we emit a waitcnt between terminators.
@@ -2238,7 +2252,7 @@ bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
22382252 NeedsEndPGMCheck = true ;
22392253 }
22402254
2241- ScoreBrackets.simplifyWaitcnt (Wait);
2255+ ScoreBrackets.simplifyWaitcnt (Wait, /* OptNone = */ false );
22422256
22432257 auto SuccessorIt = std::next (Inst.getIterator ());
22442258 bool Result = generateWaitcnt (Wait, SuccessorIt, Block, ScoreBrackets,
0 commit comments