Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 54 additions & 33 deletions llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -360,11 +360,13 @@ class SICacheControl {
/// between memory instructions to enforce the order they become visible as
/// observed by other memory instructions executing in memory scope \p Scope.
/// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
/// address spaces. Returns true iff any instructions inserted.
/// address spaces. If \p AtomicsOnly is true, waits are inserted only for the
/// counters that are used by atomic instructions.
/// Returns true iff any instructions were inserted.
virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
AtomicOrdering Order) const = 0;
AtomicOrdering Order, bool AtomicsOnly) const = 0;

/// Inserts any necessary instructions at position \p Pos relative to
/// instruction \p MI to ensure any subsequent memory instructions of this
Expand Down Expand Up @@ -437,7 +439,7 @@ class SIGfx6CacheControl : public SICacheControl {
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
AtomicOrdering Order) const override;
AtomicOrdering Order, bool AtomicsOnly) const override;

bool insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
Expand Down Expand Up @@ -484,7 +486,7 @@ class SIGfx90ACacheControl : public SIGfx7CacheControl {
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
AtomicOrdering Order) const override;
AtomicOrdering Order, bool AtomicsOnly) const override;

bool insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
Expand Down Expand Up @@ -572,7 +574,7 @@ class SIGfx10CacheControl : public SIGfx7CacheControl {
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
AtomicOrdering Order) const override;
AtomicOrdering Order, bool AtomicsOnly) const override;

bool insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
Expand Down Expand Up @@ -629,7 +631,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
AtomicOrdering Order) const override;
AtomicOrdering Order, bool AtomicsOnly) const override;

bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, Position Pos) const override;
Expand Down Expand Up @@ -1120,7 +1122,8 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
Position::AFTER, AtomicOrdering::Unordered);
Position::AFTER, AtomicOrdering::Unordered,
/*AtomicsOnly=*/false);

return Changed;
}
Expand All @@ -1140,7 +1143,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering, Position Pos,
AtomicOrdering Order) const {
AtomicOrdering Order,
bool AtomicsOnly) const {
bool Changed = false;

MachineBasicBlock &MBB = *MI->getParent();
Expand Down Expand Up @@ -1294,7 +1298,8 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
bool IsCrossAddrSpaceOrdering,
Position Pos) const {
return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
/*AtomicsOnly=*/false);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
Expand Down Expand Up @@ -1447,7 +1452,8 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
Position::AFTER, AtomicOrdering::Unordered);
Position::AFTER, AtomicOrdering::Unordered,
/*AtomicsOnly=*/false);

return Changed;
}
Expand All @@ -1467,8 +1473,8 @@ bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering,
Position Pos,
AtomicOrdering Order) const {
Position Pos, AtomicOrdering Order,
bool AtomicsOnly) const {
if (ST.isTgSplitEnabled()) {
// In threadgroup split mode the waves of a work-group can be executing on
// different CUs. Therefore need to wait for global or GDS memory operations
Expand All @@ -1488,7 +1494,8 @@ bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
AddrSpace &= ~SIAtomicAddrSpace::LDS;
}
return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
IsCrossAddrSpaceOrdering, Pos, Order);
IsCrossAddrSpaceOrdering, Pos, Order,
AtomicsOnly);
}

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
Expand Down Expand Up @@ -1747,7 +1754,8 @@ bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
Position::AFTER, AtomicOrdering::Unordered);
Position::AFTER, AtomicOrdering::Unordered,
/*AtomicsOnly=*/false);

return Changed;
}
Expand Down Expand Up @@ -1904,7 +1912,8 @@ bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
// Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
// S_WAITCNT needed.
Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
/*AtomicsOnly=*/false);

return Changed;
}
Expand Down Expand Up @@ -1984,7 +1993,8 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
Position::AFTER, AtomicOrdering::Unordered);
Position::AFTER, AtomicOrdering::Unordered,
/*AtomicsOnly=*/false);
return Changed;
}

Expand All @@ -2007,7 +2017,8 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering,
Position Pos, AtomicOrdering Order) const {
Position Pos, AtomicOrdering Order,
bool AtomicsOnly) const {
bool Changed = false;

MachineBasicBlock &MBB = *MI->getParent();
Expand Down Expand Up @@ -2281,7 +2292,8 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
Position::AFTER, AtomicOrdering::Unordered);
Position::AFTER, AtomicOrdering::Unordered,
/*AtomicsOnly=*/false);
return Changed;
}

Expand Down Expand Up @@ -2354,7 +2366,8 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsCrossAddrSpaceOrdering,
Position Pos, AtomicOrdering Order) const {
Position Pos, AtomicOrdering Order,
bool AtomicsOnly) const {
bool Changed = false;

MachineBasicBlock &MBB = *MI->getParent();
Expand Down Expand Up @@ -2444,7 +2457,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
//
// This also applies to fences. Fences cannot pair with an instruction
// tracked with bvh/samplecnt as we don't have any atomics that do that.
if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) {
if (!AtomicsOnly && ST.hasImageInsts()) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
}
Expand Down Expand Up @@ -2587,7 +2600,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
// complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
// we of course need to wait for that as well.
Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
/*AtomicsOnly=*/false);

return Changed;
}
Expand Down Expand Up @@ -2624,7 +2638,8 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
Position::AFTER, AtomicOrdering::Unordered);
Position::AFTER, AtomicOrdering::Unordered,
/*AtomicsOnly=*/false);
}

return Changed;
Expand Down Expand Up @@ -2748,13 +2763,15 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
SIMemOp::LOAD | SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE, Order);
Position::BEFORE, Order, /*AtomicsOnly=*/false);

if (Order == AtomicOrdering::Acquire ||
Order == AtomicOrdering::SequentiallyConsistent) {
Changed |= CC->insertWait(
MI, MOI.getScope(), MOI.getInstrAddrSpace(), SIMemOp::LOAD,
MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
// The wait below only needs to cover the prior atomic.
Changed |=
CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
SIMemOp::LOAD, MOI.getIsCrossAddressSpaceOrdering(),
Position::AFTER, Order, /*AtomicsOnly=*/true);
Changed |= CC->insertAcquire(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
Position::AFTER);
Expand Down Expand Up @@ -2830,9 +2847,11 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
if (MOI.isAtomic()) {
const AtomicOrdering Order = MOI.getOrdering();
if (Order == AtomicOrdering::Acquire) {
Changed |= CC->insertWait(
MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE, Order);
// Acquire fences only need to wait on the previous atomic they pair with.
Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace,
SIMemOp::LOAD | SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE, Order, /*AtomicsOnly=*/true);
}

if (Order == AtomicOrdering::Release ||
Expand Down Expand Up @@ -2897,10 +2916,12 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
Order == AtomicOrdering::SequentiallyConsistent ||
MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
Changed |= CC->insertWait(
MI, MOI.getScope(), MOI.getInstrAddrSpace(),
isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
// Only wait on the previous atomic.
Changed |=
CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER,
Order, /*AtomicsOnly=*/true);
Changed |= CC->insertAcquire(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
Position::AFTER);
Expand Down
Loading
Loading