@@ -360,11 +360,13 @@ class SICacheControl {
360360 // / between memory instructions to enforce the order they become visible as
361361 // / observed by other memory instructions executing in memory scope \p Scope.
362362 // / \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
363- // / address spaces. Returns true iff any instructions inserted.
363+ // / address spaces. If \p AtomicsOnly is true, only insert waits for counters
364+ // / that are used by atomic instructions.
365+ // / Returns true iff any instructions inserted.
364366 virtual bool insertWait (MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
365367 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
366368 bool IsCrossAddrSpaceOrdering, Position Pos,
367- AtomicOrdering Order) const = 0;
369+ AtomicOrdering Order, bool AtomicsOnly ) const = 0;
368370
369371 // / Inserts any necessary instructions at position \p Pos relative to
370372 // / instruction \p MI to ensure any subsequent memory instructions of this
@@ -437,7 +439,7 @@ class SIGfx6CacheControl : public SICacheControl {
437439 bool insertWait (MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
438440 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
439441 bool IsCrossAddrSpaceOrdering, Position Pos,
440- AtomicOrdering Order) const override ;
442+ AtomicOrdering Order, bool AtomicsOnly ) const override ;
441443
442444 bool insertAcquire (MachineBasicBlock::iterator &MI,
443445 SIAtomicScope Scope,
@@ -484,7 +486,7 @@ class SIGfx90ACacheControl : public SIGfx7CacheControl {
484486 bool insertWait (MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
485487 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
486488 bool IsCrossAddrSpaceOrdering, Position Pos,
487- AtomicOrdering Order) const override ;
489+ AtomicOrdering Order, bool AtomicsOnly ) const override ;
488490
489491 bool insertAcquire (MachineBasicBlock::iterator &MI,
490492 SIAtomicScope Scope,
@@ -572,7 +574,7 @@ class SIGfx10CacheControl : public SIGfx7CacheControl {
572574 bool insertWait (MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
573575 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
574576 bool IsCrossAddrSpaceOrdering, Position Pos,
575- AtomicOrdering Order) const override ;
577+ AtomicOrdering Order, bool AtomicsOnly ) const override ;
576578
577579 bool insertAcquire (MachineBasicBlock::iterator &MI,
578580 SIAtomicScope Scope,
@@ -629,7 +631,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
629631 bool insertWait (MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
630632 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
631633 bool IsCrossAddrSpaceOrdering, Position Pos,
632- AtomicOrdering Order) const override ;
634+ AtomicOrdering Order, bool AtomicsOnly ) const override ;
633635
634636 bool insertAcquire (MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
635637 SIAtomicAddrSpace AddrSpace, Position Pos) const override ;
@@ -1120,7 +1122,8 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
11201122 // observable outside the program, so no need to cause a waitcnt for LDS
11211123 // address space operations.
11221124 Changed |= insertWait (MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false ,
1123- Position::AFTER, AtomicOrdering::Unordered);
1125+ Position::AFTER, AtomicOrdering::Unordered,
1126+ /* AtomicsOnly=*/ false );
11241127
11251128 return Changed;
11261129 }
@@ -1140,7 +1143,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
11401143 SIAtomicScope Scope,
11411144 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
11421145 bool IsCrossAddrSpaceOrdering, Position Pos,
1143- AtomicOrdering Order) const {
1146+ AtomicOrdering Order,
1147+ bool AtomicsOnly) const {
11441148 bool Changed = false ;
11451149
11461150 MachineBasicBlock &MBB = *MI->getParent ();
@@ -1294,7 +1298,8 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
12941298 bool IsCrossAddrSpaceOrdering,
12951299 Position Pos) const {
12961300 return insertWait (MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1297- IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
1301+ IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
1302+ /* AtomicsOnly=*/ false );
12981303}
12991304
13001305bool SIGfx7CacheControl::insertAcquire (MachineBasicBlock::iterator &MI,
@@ -1447,7 +1452,8 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
14471452 // observable outside the program, so no need to cause a waitcnt for LDS
14481453 // address space operations.
14491454 Changed |= insertWait (MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false ,
1450- Position::AFTER, AtomicOrdering::Unordered);
1455+ Position::AFTER, AtomicOrdering::Unordered,
1456+ /* AtomicsOnly=*/ false );
14511457
14521458 return Changed;
14531459 }
@@ -1467,8 +1473,8 @@ bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
14671473 SIAtomicScope Scope,
14681474 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
14691475 bool IsCrossAddrSpaceOrdering,
1470- Position Pos,
1471- AtomicOrdering Order ) const {
1476+ Position Pos, AtomicOrdering Order,
1477+ bool AtomicsOnly ) const {
14721478 if (ST.isTgSplitEnabled ()) {
14731479 // In threadgroup split mode the waves of a work-group can be executing on
14741480 // different CUs. Therefore need to wait for global or GDS memory operations
@@ -1488,7 +1494,8 @@ bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
14881494 AddrSpace &= ~SIAtomicAddrSpace::LDS;
14891495 }
14901496 return SIGfx7CacheControl::insertWait (MI, Scope, AddrSpace, Op,
1491- IsCrossAddrSpaceOrdering, Pos, Order);
1497+ IsCrossAddrSpaceOrdering, Pos, Order,
1498+ AtomicsOnly);
14921499}
14931500
14941501bool SIGfx90ACacheControl::insertAcquire (MachineBasicBlock::iterator &MI,
@@ -1747,7 +1754,8 @@ bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
17471754 // observable outside the program, so no need to cause a waitcnt for LDS
17481755 // address space operations.
17491756 Changed |= insertWait (MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false ,
1750- Position::AFTER, AtomicOrdering::Unordered);
1757+ Position::AFTER, AtomicOrdering::Unordered,
1758+ /* AtomicsOnly=*/ false );
17511759
17521760 return Changed;
17531761 }
@@ -1904,7 +1912,8 @@ bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
19041912 // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
19051913 // S_WAITCNT needed.
19061914 Changed |= insertWait (MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1907- IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
1915+ IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
1916+ /* AtomicsOnly=*/ false );
19081917
19091918 return Changed;
19101919}
@@ -1984,7 +1993,8 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
19841993 // observable outside the program, so no need to cause a waitcnt for LDS
19851994 // address space operations.
19861995 Changed |= insertWait (MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false ,
1987- Position::AFTER, AtomicOrdering::Unordered);
1996+ Position::AFTER, AtomicOrdering::Unordered,
1997+ /* AtomicsOnly=*/ false );
19881998 return Changed;
19891999 }
19902000
@@ -2007,7 +2017,8 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
20072017 SIAtomicScope Scope,
20082018 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
20092019 bool IsCrossAddrSpaceOrdering,
2010- Position Pos, AtomicOrdering Order) const {
2020+ Position Pos, AtomicOrdering Order,
2021+ bool AtomicsOnly) const {
20112022 bool Changed = false ;
20122023
20132024 MachineBasicBlock &MBB = *MI->getParent ();
@@ -2281,7 +2292,8 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
22812292 // observable outside the program, so no need to cause a waitcnt for LDS
22822293 // address space operations.
22832294 Changed |= insertWait (MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false ,
2284- Position::AFTER, AtomicOrdering::Unordered);
2295+ Position::AFTER, AtomicOrdering::Unordered,
2296+ /* AtomicsOnly=*/ false );
22852297 return Changed;
22862298 }
22872299
@@ -2354,7 +2366,8 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
23542366 SIAtomicScope Scope,
23552367 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
23562368 bool IsCrossAddrSpaceOrdering,
2357- Position Pos, AtomicOrdering Order) const {
2369+ Position Pos, AtomicOrdering Order,
2370+ bool AtomicsOnly) const {
23582371 bool Changed = false ;
23592372
23602373 MachineBasicBlock &MBB = *MI->getParent ();
@@ -2444,7 +2457,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
24442457 //
24452458 // This also applies to fences. Fences cannot pair with an instruction
24462459 // tracked with bvh/samplecnt as we don't have any atomics that do that.
2447- if (Order != AtomicOrdering::Acquire && ST.hasImageInsts ()) {
2460+ if (!AtomicsOnly && ST.hasImageInsts ()) {
24482461 BuildMI (MBB, MI, DL, TII->get (AMDGPU::S_WAIT_BVHCNT_soft)).addImm (0 );
24492462 BuildMI (MBB, MI, DL, TII->get (AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm (0 );
24502463 }
@@ -2587,7 +2600,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
25872600 // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
25882601 // we of course need to wait for that as well.
25892602 Changed |= insertWait (MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2590- IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
2603+ IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
2604+ /* AtomicsOnly=*/ false );
25912605
25922606 return Changed;
25932607}
@@ -2624,7 +2638,8 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
26242638 // observable outside the program, so no need to cause a waitcnt for LDS
26252639 // address space operations.
26262640 Changed |= insertWait (MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false ,
2627- Position::AFTER, AtomicOrdering::Unordered);
2641+ Position::AFTER, AtomicOrdering::Unordered,
2642+ /* AtomicsOnly=*/ false );
26282643 }
26292644
26302645 return Changed;
@@ -2748,13 +2763,15 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
27482763 Changed |= CC->insertWait (MI, MOI.getScope (), MOI.getOrderingAddrSpace (),
27492764 SIMemOp::LOAD | SIMemOp::STORE,
27502765 MOI.getIsCrossAddressSpaceOrdering (),
2751- Position::BEFORE, Order);
2766+ Position::BEFORE, Order, /* AtomicsOnly= */ false );
27522767
27532768 if (Order == AtomicOrdering::Acquire ||
27542769 Order == AtomicOrdering::SequentiallyConsistent) {
2755- Changed |= CC->insertWait (
2756- MI, MOI.getScope (), MOI.getInstrAddrSpace (), SIMemOp::LOAD,
2757- MOI.getIsCrossAddressSpaceOrdering (), Position::AFTER, Order);
2770+ // The wait below only needs to wait on the prior atomic.
2771+ Changed |=
2772+ CC->insertWait (MI, MOI.getScope (), MOI.getInstrAddrSpace (),
2773+ SIMemOp::LOAD, MOI.getIsCrossAddressSpaceOrdering (),
2774+ Position::AFTER, Order, /* AtomicsOnly=*/ true );
27582775 Changed |= CC->insertAcquire (MI, MOI.getScope (),
27592776 MOI.getOrderingAddrSpace (),
27602777 Position::AFTER);
@@ -2830,9 +2847,11 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
28302847 if (MOI.isAtomic ()) {
28312848 const AtomicOrdering Order = MOI.getOrdering ();
28322849 if (Order == AtomicOrdering::Acquire) {
2833- Changed |= CC->insertWait (
2834- MI, MOI.getScope (), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2835- MOI.getIsCrossAddressSpaceOrdering (), Position::BEFORE, Order);
2850+ // Acquire fences only need to wait on the previous atomic they pair with.
2851+ Changed |= CC->insertWait (MI, MOI.getScope (), OrderingAddrSpace,
2852+ SIMemOp::LOAD | SIMemOp::STORE,
2853+ MOI.getIsCrossAddressSpaceOrdering (),
2854+ Position::BEFORE, Order, /* AtomicsOnly=*/ true );
28362855 }
28372856
28382857 if (Order == AtomicOrdering::Release ||
@@ -2897,10 +2916,12 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
28972916 Order == AtomicOrdering::SequentiallyConsistent ||
28982917 MOI.getFailureOrdering () == AtomicOrdering::Acquire ||
28992918 MOI.getFailureOrdering () == AtomicOrdering::SequentiallyConsistent) {
2900- Changed |= CC->insertWait (
2901- MI, MOI.getScope (), MOI.getInstrAddrSpace (),
2902- isAtomicRet (*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2903- MOI.getIsCrossAddressSpaceOrdering (), Position::AFTER, Order);
2919+ // Only wait on the previous atomic.
2920+ Changed |=
2921+ CC->insertWait (MI, MOI.getScope (), MOI.getInstrAddrSpace (),
2922+ isAtomicRet (*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2923+ MOI.getIsCrossAddressSpaceOrdering (), Position::AFTER,
2924+ Order, /* AtomicsOnly=*/ true );
29042925 Changed |= CC->insertAcquire (MI, MOI.getScope (),
29052926 MOI.getOrderingAddrSpace (),
29062927 Position::AFTER);
0 commit comments