@@ -1074,6 +1074,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
10741074 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
10751075 bool IsCrossAddrSpaceOrdering, Position Pos,
10761076 AtomicOrdering Order) const {
1077+ bool Changed = false ;
1078+
10771079 MachineBasicBlock &MBB = *MI->getParent ();
10781080 DebugLoc DL = MI->getDebugLoc ();
10791081
@@ -1147,19 +1149,25 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
11471149 }
11481150 }
11491151
1150- // Always emit a soft wait count, even if it is trivially ~0. SIInsertWaitcnts
1151- // will later use this marker to add additional waits such as those required
1152+ // Always emit a soft wait count at a release, even if it is trivially ~0.
1153+ // SIInsertWaitcnts will later add additional waits such as those required
11521154 // from direct load to LDS (formerly known as LDS DMA).
1153- unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt (
1154- IV, VMCnt ? 0 : getVmcntBitMask (IV), getExpcntBitMask (IV),
1155- LGKMCnt ? 0 : getLgkmcntBitMask (IV));
1156- BuildMI (MBB, MI, DL, TII->get (AMDGPU::S_WAITCNT_soft))
1157- .addImm (WaitCntImmediate);
1155+ if (VMCnt || LGKMCnt ||
1156+ (isReleaseOrStronger (Order) && Scope >= SIAtomicScope::WORKGROUP)) {
1157+ unsigned WaitCntImmediate =
1158+ AMDGPU::encodeWaitcnt (IV,
1159+ VMCnt ? 0 : getVmcntBitMask (IV),
1160+ getExpcntBitMask (IV),
1161+ LGKMCnt ? 0 : getLgkmcntBitMask (IV));
1162+ BuildMI (MBB, MI, DL, TII->get (AMDGPU::S_WAITCNT_soft))
1163+ .addImm (WaitCntImmediate);
1164+ Changed = true ;
1165+ }
11581166
11591167 if (Pos == Position::AFTER)
11601168 --MI;
11611169
1162- return true ;
1170+ return Changed ;
11631171}
11641172
11651173bool SIGfx6CacheControl::insertAcquire (MachineBasicBlock::iterator &MI,
@@ -1962,6 +1970,8 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
19621970 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
19631971 bool IsCrossAddrSpaceOrdering,
19641972 Position Pos, AtomicOrdering Order) const {
1973+ bool Changed = false ;
1974+
19651975 MachineBasicBlock &MBB = *MI->getParent ();
19661976 DebugLoc DL = MI->getDebugLoc ();
19671977
@@ -2051,25 +2061,32 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
20512061 }
20522062 }
20532063
2054- // Always emit a soft wait count, even if it is trivially ~0. SIInsertWaitcnts
2055- // will later use this marker to add additional waits such as those required
2064+ // Always emit a soft wait count at a release, even if it is trivially ~0.
2065+ // SIInsertWaitcnts will later add additional waits such as those required
20562066 // from direct load to LDS (formerly known as LDS DMA).
2057- unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt (
2058- IV, VMCnt ? 0 : getVmcntBitMask (IV), getExpcntBitMask (IV),
2059- LGKMCnt ? 0 : getLgkmcntBitMask (IV));
2060- BuildMI (MBB, MI, DL, TII->get (AMDGPU::S_WAITCNT_soft))
2061- .addImm (WaitCntImmediate);
2067+ if (VMCnt || LGKMCnt ||
2068+ (isReleaseOrStronger (Order) && Scope >= SIAtomicScope::WORKGROUP)) {
2069+ unsigned WaitCntImmediate =
2070+ AMDGPU::encodeWaitcnt (IV,
2071+ VMCnt ? 0 : getVmcntBitMask (IV),
2072+ getExpcntBitMask (IV),
2073+ LGKMCnt ? 0 : getLgkmcntBitMask (IV));
2074+ BuildMI (MBB, MI, DL, TII->get (AMDGPU::S_WAITCNT_soft))
2075+ .addImm (WaitCntImmediate);
2076+ Changed = true ;
2077+ }
20622078
20632079 if (VSCnt) {
20642080 BuildMI (MBB, MI, DL, TII->get (AMDGPU::S_WAITCNT_VSCNT_soft))
20652081 .addReg (AMDGPU::SGPR_NULL, RegState::Undef)
20662082 .addImm (0 );
2083+ Changed = true ;
20672084 }
20682085
20692086 if (Pos == Position::AFTER)
20702087 --MI;
20712088
2072- return true ;
2089+ return Changed ;
20732090}
20742091
20752092bool SIGfx10CacheControl::insertAcquire (MachineBasicBlock::iterator &MI,
@@ -2278,6 +2295,8 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
22782295 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
22792296 bool IsCrossAddrSpaceOrdering,
22802297 Position Pos, AtomicOrdering Order) const {
2298+ bool Changed = false ;
2299+
22812300 MachineBasicBlock &MBB = *MI->getParent ();
22822301 DebugLoc DL = MI->getDebugLoc ();
22832302
@@ -2361,26 +2380,30 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
23612380 BuildMI (MBB, MI, DL, TII->get (AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm (0 );
23622381 }
23632382 BuildMI (MBB, MI, DL, TII->get (AMDGPU::S_WAIT_LOADCNT_soft)).addImm (0 );
2364- } else {
2365- // Always emit a soft wait count, even if it is trivially ~0.
2366- // SIInsertWaitcnts will later use this marker to add additional waits such
2367- // as those required from direct load to LDS (formerly known as LDS DMA).
2383+ Changed = true ;
2384+ } else if (isReleaseOrStronger (Order) && Scope >= SIAtomicScope::WORKGROUP) {
2385+ // Always emit a soft wait count at a release, even if it is trivially ~0.
2386+ // SIInsertWaitcnts will later add additional waits such as those required
2387+ // from direct load to LDS (formerly known as LDS DMA).
23682388 BuildMI (MBB, MI, DL, TII->get (AMDGPU::S_WAIT_LOADCNT_soft))
23692389 .addImm (getLoadcntBitMask (IV));
2390+ Changed = true ;
23702391 }
23712392
23722393 if (STORECnt) {
23732394 BuildMI (MBB, MI, DL, TII->get (AMDGPU::S_WAIT_STORECNT_soft)).addImm (0 );
2395+ Changed = true ;
23742396 }
23752397
23762398 if (DSCnt) {
23772399 BuildMI (MBB, MI, DL, TII->get (AMDGPU::S_WAIT_DSCNT_soft)).addImm (0 );
2400+ Changed = true ;
23782401 }
23792402
23802403 if (Pos == Position::AFTER)
23812404 --MI;
23822405
2383- return true ;
2406+ return Changed ;
23842407}
23852408
23862409bool SIGfx12CacheControl::insertAcquire (MachineBasicBlock::iterator &MI,
0 commit comments