Skip to content

Commit 9e16c25

Browse files
Pierre-vhLukacma
authored andcommitted
[AMDGPU][SIMemoryLegalizer][GFX12] Correctly insert sample/bvhcnt (llvm#161637)
The check used was not strong enough to prevent the insertion of sample/bvhcnt when they were not needed. I assume SIInsertWaitCnts was trimming those away anyway, but this was a bug nonetheless. We were inserting SAMPLE/BVHcnt waits in places where we only needed to wait on the previous atomic operation. Neither of these counter have any atomics associated with them.
1 parent 088d66e commit 9e16c25

9 files changed

+54
-761
lines changed

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 54 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -360,11 +360,13 @@ class SICacheControl {
360360
/// between memory instructions to enforce the order they become visible as
361361
/// observed by other memory instructions executing in memory scope \p Scope.
362362
/// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
363-
/// address spaces. Returns true iff any instructions inserted.
363+
/// address spaces. If \p AtomicsOnly is true, only insert waits for counters
364+
/// that are used by atomic instructions.
365+
/// Returns true iff any instructions inserted.
364366
virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
365367
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
366368
bool IsCrossAddrSpaceOrdering, Position Pos,
367-
AtomicOrdering Order) const = 0;
369+
AtomicOrdering Order, bool AtomicsOnly) const = 0;
368370

369371
/// Inserts any necessary instructions at position \p Pos relative to
370372
/// instruction \p MI to ensure any subsequent memory instructions of this
@@ -437,7 +439,7 @@ class SIGfx6CacheControl : public SICacheControl {
437439
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
438440
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
439441
bool IsCrossAddrSpaceOrdering, Position Pos,
440-
AtomicOrdering Order) const override;
442+
AtomicOrdering Order, bool AtomicsOnly) const override;
441443

442444
bool insertAcquire(MachineBasicBlock::iterator &MI,
443445
SIAtomicScope Scope,
@@ -484,7 +486,7 @@ class SIGfx90ACacheControl : public SIGfx7CacheControl {
484486
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
485487
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
486488
bool IsCrossAddrSpaceOrdering, Position Pos,
487-
AtomicOrdering Order) const override;
489+
AtomicOrdering Order, bool AtomicsOnly) const override;
488490

489491
bool insertAcquire(MachineBasicBlock::iterator &MI,
490492
SIAtomicScope Scope,
@@ -572,7 +574,7 @@ class SIGfx10CacheControl : public SIGfx7CacheControl {
572574
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
573575
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
574576
bool IsCrossAddrSpaceOrdering, Position Pos,
575-
AtomicOrdering Order) const override;
577+
AtomicOrdering Order, bool AtomicsOnly) const override;
576578

577579
bool insertAcquire(MachineBasicBlock::iterator &MI,
578580
SIAtomicScope Scope,
@@ -629,7 +631,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
629631
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
630632
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
631633
bool IsCrossAddrSpaceOrdering, Position Pos,
632-
AtomicOrdering Order) const override;
634+
AtomicOrdering Order, bool AtomicsOnly) const override;
633635

634636
bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
635637
SIAtomicAddrSpace AddrSpace, Position Pos) const override;
@@ -1120,7 +1122,8 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
11201122
// observable outside the program, so no need to cause a waitcnt for LDS
11211123
// address space operations.
11221124
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1123-
Position::AFTER, AtomicOrdering::Unordered);
1125+
Position::AFTER, AtomicOrdering::Unordered,
1126+
/*AtomicsOnly=*/false);
11241127

11251128
return Changed;
11261129
}
@@ -1140,7 +1143,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
11401143
SIAtomicScope Scope,
11411144
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
11421145
bool IsCrossAddrSpaceOrdering, Position Pos,
1143-
AtomicOrdering Order) const {
1146+
AtomicOrdering Order,
1147+
bool AtomicsOnly) const {
11441148
bool Changed = false;
11451149

11461150
MachineBasicBlock &MBB = *MI->getParent();
@@ -1294,7 +1298,8 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
12941298
bool IsCrossAddrSpaceOrdering,
12951299
Position Pos) const {
12961300
return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1297-
IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
1301+
IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
1302+
/*AtomicsOnly=*/false);
12981303
}
12991304

13001305
bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
@@ -1447,7 +1452,8 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
14471452
// observable outside the program, so no need to cause a waitcnt for LDS
14481453
// address space operations.
14491454
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1450-
Position::AFTER, AtomicOrdering::Unordered);
1455+
Position::AFTER, AtomicOrdering::Unordered,
1456+
/*AtomicsOnly=*/false);
14511457

14521458
return Changed;
14531459
}
@@ -1467,8 +1473,8 @@ bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
14671473
SIAtomicScope Scope,
14681474
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
14691475
bool IsCrossAddrSpaceOrdering,
1470-
Position Pos,
1471-
AtomicOrdering Order) const {
1476+
Position Pos, AtomicOrdering Order,
1477+
bool AtomicsOnly) const {
14721478
if (ST.isTgSplitEnabled()) {
14731479
// In threadgroup split mode the waves of a work-group can be executing on
14741480
// different CUs. Therefore need to wait for global or GDS memory operations
@@ -1488,7 +1494,8 @@ bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
14881494
AddrSpace &= ~SIAtomicAddrSpace::LDS;
14891495
}
14901496
return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1491-
IsCrossAddrSpaceOrdering, Pos, Order);
1497+
IsCrossAddrSpaceOrdering, Pos, Order,
1498+
AtomicsOnly);
14921499
}
14931500

14941501
bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
@@ -1747,7 +1754,8 @@ bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
17471754
// observable outside the program, so no need to cause a waitcnt for LDS
17481755
// address space operations.
17491756
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1750-
Position::AFTER, AtomicOrdering::Unordered);
1757+
Position::AFTER, AtomicOrdering::Unordered,
1758+
/*AtomicsOnly=*/false);
17511759

17521760
return Changed;
17531761
}
@@ -1904,7 +1912,8 @@ bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
19041912
// Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
19051913
// S_WAITCNT needed.
19061914
Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1907-
IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
1915+
IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
1916+
/*AtomicsOnly=*/false);
19081917

19091918
return Changed;
19101919
}
@@ -1984,7 +1993,8 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
19841993
// observable outside the program, so no need to cause a waitcnt for LDS
19851994
// address space operations.
19861995
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1987-
Position::AFTER, AtomicOrdering::Unordered);
1996+
Position::AFTER, AtomicOrdering::Unordered,
1997+
/*AtomicsOnly=*/false);
19881998
return Changed;
19891999
}
19902000

@@ -2007,7 +2017,8 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
20072017
SIAtomicScope Scope,
20082018
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
20092019
bool IsCrossAddrSpaceOrdering,
2010-
Position Pos, AtomicOrdering Order) const {
2020+
Position Pos, AtomicOrdering Order,
2021+
bool AtomicsOnly) const {
20112022
bool Changed = false;
20122023

20132024
MachineBasicBlock &MBB = *MI->getParent();
@@ -2281,7 +2292,8 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
22812292
// observable outside the program, so no need to cause a waitcnt for LDS
22822293
// address space operations.
22832294
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2284-
Position::AFTER, AtomicOrdering::Unordered);
2295+
Position::AFTER, AtomicOrdering::Unordered,
2296+
/*AtomicsOnly=*/false);
22852297
return Changed;
22862298
}
22872299

@@ -2354,7 +2366,8 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
23542366
SIAtomicScope Scope,
23552367
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
23562368
bool IsCrossAddrSpaceOrdering,
2357-
Position Pos, AtomicOrdering Order) const {
2369+
Position Pos, AtomicOrdering Order,
2370+
bool AtomicsOnly) const {
23582371
bool Changed = false;
23592372

23602373
MachineBasicBlock &MBB = *MI->getParent();
@@ -2444,7 +2457,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
24442457
//
24452458
// This also applies to fences. Fences cannot pair with an instruction
24462459
// tracked with bvh/samplecnt as we don't have any atomics that do that.
2447-
if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) {
2460+
if (!AtomicsOnly && ST.hasImageInsts()) {
24482461
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
24492462
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
24502463
}
@@ -2587,7 +2600,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
25872600
// complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
25882601
// we of course need to wait for that as well.
25892602
Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2590-
IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
2603+
IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
2604+
/*AtomicsOnly=*/false);
25912605

25922606
return Changed;
25932607
}
@@ -2624,7 +2638,8 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
26242638
// observable outside the program, so no need to cause a waitcnt for LDS
26252639
// address space operations.
26262640
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2627-
Position::AFTER, AtomicOrdering::Unordered);
2641+
Position::AFTER, AtomicOrdering::Unordered,
2642+
/*AtomicsOnly=*/false);
26282643
}
26292644

26302645
return Changed;
@@ -2748,13 +2763,15 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
27482763
Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
27492764
SIMemOp::LOAD | SIMemOp::STORE,
27502765
MOI.getIsCrossAddressSpaceOrdering(),
2751-
Position::BEFORE, Order);
2766+
Position::BEFORE, Order, /*AtomicsOnly=*/false);
27522767

27532768
if (Order == AtomicOrdering::Acquire ||
27542769
Order == AtomicOrdering::SequentiallyConsistent) {
2755-
Changed |= CC->insertWait(
2756-
MI, MOI.getScope(), MOI.getInstrAddrSpace(), SIMemOp::LOAD,
2757-
MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
2770+
// The wait below only needs to wait on the prior atomic.
2771+
Changed |=
2772+
CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2773+
SIMemOp::LOAD, MOI.getIsCrossAddressSpaceOrdering(),
2774+
Position::AFTER, Order, /*AtomicsOnly=*/true);
27582775
Changed |= CC->insertAcquire(MI, MOI.getScope(),
27592776
MOI.getOrderingAddrSpace(),
27602777
Position::AFTER);
@@ -2830,9 +2847,11 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
28302847
if (MOI.isAtomic()) {
28312848
const AtomicOrdering Order = MOI.getOrdering();
28322849
if (Order == AtomicOrdering::Acquire) {
2833-
Changed |= CC->insertWait(
2834-
MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2835-
MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE, Order);
2850+
// Acquire fences only need to wait on the previous atomic they pair with.
2851+
Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace,
2852+
SIMemOp::LOAD | SIMemOp::STORE,
2853+
MOI.getIsCrossAddressSpaceOrdering(),
2854+
Position::BEFORE, Order, /*AtomicsOnly=*/true);
28362855
}
28372856

28382857
if (Order == AtomicOrdering::Release ||
@@ -2897,10 +2916,12 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
28972916
Order == AtomicOrdering::SequentiallyConsistent ||
28982917
MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
28992918
MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2900-
Changed |= CC->insertWait(
2901-
MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2902-
isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2903-
MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
2919+
// Only wait on the previous atomic.
2920+
Changed |=
2921+
CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2922+
isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2923+
MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER,
2924+
Order, /*AtomicsOnly=*/true);
29042925
Changed |= CC->insertAcquire(MI, MOI.getScope(),
29052926
MOI.getOrderingAddrSpace(),
29062927
Position::AFTER);

0 commit comments

Comments
 (0)