Skip to content

Commit 20795e0

Browse files
authored
[AMDGPU][SIMemoryLegalizer] Combine GFX10-11 CacheControl Classes (#168058)
Also breaks up the long inheritance chains by making both `SIGfx10CacheControl` and `SIGfx12CacheControl` inherit from `SICacheControl` directly. With this patch, there are just three `SICacheControl` implementations, each self-contained, and no behavior is hidden three superclasses up (which made this code harder to read and maintain than it needed to be).
1 parent ee1abb8 commit 20795e0

File tree

1 file changed

+38
-120
lines changed

1 file changed

+38
-120
lines changed

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 38 additions & 120 deletions
Original file line numberDiff line numberDiff line change
@@ -404,7 +404,7 @@ class SICacheControl {
404404

405405
/// Generates code sequences for the memory model of all GFX targets below
406406
/// GFX10.
407-
class SIGfx6CacheControl : public SICacheControl {
407+
class SIGfx6CacheControl final : public SICacheControl {
408408
public:
409409

410410
SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
@@ -443,14 +443,27 @@ class SIGfx6CacheControl : public SICacheControl {
443443
Position Pos) const override;
444444
};
445445

446-
class SIGfx10CacheControl : public SIGfx6CacheControl {
446+
/// Generates code sequences for the memory model of GFX10/11.
447+
class SIGfx10CacheControl final : public SICacheControl {
447448
public:
448-
SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
449+
SIGfx10CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
449450

450451
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
451452
SIAtomicScope Scope,
452453
SIAtomicAddrSpace AddrSpace) const override;
453454

455+
bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
456+
SIAtomicScope Scope,
457+
SIAtomicAddrSpace AddrSpace) const override {
458+
return false;
459+
}
460+
461+
bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
462+
SIAtomicScope Scope,
463+
SIAtomicAddrSpace AddrSpace) const override {
464+
return false;
465+
}
466+
454467
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
455468
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
456469
bool IsVolatile, bool IsNonTemporal,
@@ -463,23 +476,17 @@ class SIGfx10CacheControl : public SIGfx6CacheControl {
463476

464477
bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
465478
SIAtomicAddrSpace AddrSpace, Position Pos) const override;
466-
};
467-
468-
class SIGfx11CacheControl : public SIGfx10CacheControl {
469-
public:
470-
SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
471479

472-
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
473-
SIAtomicScope Scope,
474-
SIAtomicAddrSpace AddrSpace) const override;
475-
476-
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
477-
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
478-
bool IsVolatile, bool IsNonTemporal,
479-
bool IsLastUse) const override;
480+
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
481+
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
482+
Position Pos) const override {
483+
return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
484+
IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
485+
/*AtomicsOnly=*/false);
486+
}
480487
};
481488

482-
class SIGfx12CacheControl : public SIGfx11CacheControl {
489+
class SIGfx12CacheControl final : public SICacheControl {
483490
protected:
484491
// Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
485492
// \returns Returns true if \p MI is modified, false otherwise.
@@ -504,7 +511,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
504511
SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
505512

506513
public:
507-
SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
514+
SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {
508515
// GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
509516
// the behavior is the same if assuming GFX12.0 in CU mode.
510517
assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
@@ -915,10 +922,8 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
915922
GCNSubtarget::Generation Generation = ST.getGeneration();
916923
if (Generation < AMDGPUSubtarget::GFX10)
917924
return std::make_unique<SIGfx6CacheControl>(ST);
918-
if (Generation < AMDGPUSubtarget::GFX11)
919-
return std::make_unique<SIGfx10CacheControl>(ST);
920925
if (Generation < AMDGPUSubtarget::GFX12)
921-
return std::make_unique<SIGfx11CacheControl>(ST);
926+
return std::make_unique<SIGfx10CacheControl>(ST);
922927
return std::make_unique<SIGfx12CacheControl>(ST);
923928
}
924929

@@ -1438,8 +1443,7 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
14381443
}
14391444

14401445
bool SIGfx10CacheControl::enableLoadCacheBypass(
1441-
const MachineBasicBlock::iterator &MI,
1442-
SIAtomicScope Scope,
1446+
const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
14431447
SIAtomicAddrSpace AddrSpace) const {
14441448
assert(MI->mayLoad() && !MI->mayStore());
14451449
bool Changed = false;
@@ -1450,7 +1454,9 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
14501454
case SIAtomicScope::AGENT:
14511455
// Set the L0 and L1 cache policies to MISS_EVICT.
14521456
// Note: there is no L2 cache coherent bypass control at the ISA level.
1453-
Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
1457+
// For GFX10, set GLC+DLC; for GFX11, only set GLC.
1458+
Changed |=
1459+
enableCPolBits(MI, CPol::GLC | (AMDGPU::isGFX10(ST) ? CPol::DLC : 0));
14541460
break;
14551461
case SIAtomicScope::WORKGROUP:
14561462
// In WGP mode the waves of a work-group can be executing on either CU of
@@ -1504,6 +1510,10 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
15041510
Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
15051511
}
15061512

1513+
// GFX11: Set MALL NOALLOC for both load and store instructions.
1514+
if (AMDGPU::isGFX11(ST))
1515+
Changed |= enableCPolBits(MI, CPol::DLC);
1516+
15071517
// Ensure operation has completed at system scope to cause all volatile
15081518
// operations to be visible outside the program in a global order. Do not
15091519
// request cross address space as only the global address space can be
@@ -1524,6 +1534,10 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
15241534
Changed |= enableCPolBits(MI, CPol::GLC);
15251535
Changed |= enableCPolBits(MI, CPol::SLC);
15261536

1537+
// GFX11: Set MALL NOALLOC for both load and store instructions.
1538+
if (AMDGPU::isGFX11(ST))
1539+
Changed |= enableCPolBits(MI, CPol::DLC);
1540+
15271541
return Changed;
15281542
}
15291543

@@ -1722,102 +1736,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
17221736
return Changed;
17231737
}
17241738

1725-
bool SIGfx11CacheControl::enableLoadCacheBypass(
1726-
const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1727-
SIAtomicAddrSpace AddrSpace) const {
1728-
assert(MI->mayLoad() && !MI->mayStore());
1729-
bool Changed = false;
1730-
1731-
if (canAffectGlobalAddrSpace(AddrSpace)) {
1732-
switch (Scope) {
1733-
case SIAtomicScope::SYSTEM:
1734-
case SIAtomicScope::AGENT:
1735-
// Set the L0 and L1 cache policies to MISS_EVICT.
1736-
// Note: there is no L2 cache coherent bypass control at the ISA level.
1737-
Changed |= enableCPolBits(MI, CPol::GLC);
1738-
break;
1739-
case SIAtomicScope::WORKGROUP:
1740-
// In WGP mode the waves of a work-group can be executing on either CU of
1741-
// the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1742-
// CU mode all waves of a work-group are on the same CU, and so the L0
1743-
// does not need to be bypassed.
1744-
if (!ST.isCuModeEnabled())
1745-
Changed |= enableCPolBits(MI, CPol::GLC);
1746-
break;
1747-
case SIAtomicScope::WAVEFRONT:
1748-
case SIAtomicScope::SINGLETHREAD:
1749-
// No cache to bypass.
1750-
break;
1751-
default:
1752-
llvm_unreachable("Unsupported synchronization scope");
1753-
}
1754-
}
1755-
1756-
/// The scratch address space does not need the global memory caches
1757-
/// to be bypassed as all memory operations by the same thread are
1758-
/// sequentially consistent, and no other thread can access scratch
1759-
/// memory.
1760-
1761-
/// Other address spaces do not have a cache.
1762-
1763-
return Changed;
1764-
}
1765-
1766-
bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
1767-
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1768-
bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1769-
1770-
// Only handle load and store, not atomic read-modify-write instructions. The
1771-
// latter use glc to indicate if the atomic returns a result and so must not
1772-
// be used for cache control.
1773-
assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
1774-
1775-
// Only update load and store, not LLVM IR atomic read-modify-write
1776-
// instructions. The latter are always marked as volatile so cannot sensibly
1777-
// handle it, as we do not want to pessimize all atomics. Also they do not support
1778-
// the nontemporal attribute.
1779-
assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1780-
1781-
bool Changed = false;
1782-
1783-
if (IsVolatile) {
1784-
// Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1785-
// and MISS_LRU for store instructions.
1786-
// Note: there is no L2 cache coherent bypass control at the ISA level.
1787-
if (Op == SIMemOp::LOAD)
1788-
Changed |= enableCPolBits(MI, CPol::GLC);
1789-
1790-
// Set MALL NOALLOC for load and store instructions.
1791-
Changed |= enableCPolBits(MI, CPol::DLC);
1792-
1793-
// Ensure operation has completed at system scope to cause all volatile
1794-
// operations to be visible outside the program in a global order. Do not
1795-
// request cross address space as only the global address space can be
1796-
// observable outside the program, so no need to cause a waitcnt for LDS
1797-
// address space operations.
1798-
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1799-
Position::AFTER, AtomicOrdering::Unordered,
1800-
/*AtomicsOnly=*/false);
1801-
return Changed;
1802-
}
1803-
1804-
if (IsNonTemporal) {
1805-
// For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1806-
// and L2 cache policy to STREAM.
1807-
// For stores setting both GLC and SLC configures L0 and L1 cache policy
1808-
// to MISS_EVICT and the L2 cache policy to STREAM.
1809-
if (Op == SIMemOp::STORE)
1810-
Changed |= enableCPolBits(MI, CPol::GLC);
1811-
Changed |= enableCPolBits(MI, CPol::SLC);
1812-
1813-
// Set MALL NOALLOC for load and store instructions.
1814-
Changed |= enableCPolBits(MI, CPol::DLC);
1815-
return Changed;
1816-
}
1817-
1818-
return Changed;
1819-
}
1820-
18211739
bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
18221740
AMDGPU::CPol::CPol Value) const {
18231741
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);

0 commit comments

Comments
 (0)