@@ -404,7 +404,7 @@ class SICacheControl {
404404
405405/// Generates code sequences for the memory model of all GFX targets below
406406/// GFX10.
407- class SIGfx6CacheControl : public SICacheControl {
407+ class SIGfx6CacheControl final : public SICacheControl {
408408public:
409409
410410 SIGfx6CacheControl (const GCNSubtarget &ST) : SICacheControl(ST) {}
@@ -443,14 +443,27 @@ class SIGfx6CacheControl : public SICacheControl {
443443 Position Pos) const override ;
444444};
445445
446- class SIGfx10CacheControl : public SIGfx6CacheControl {
446+ /// Generates code sequences for the memory model of GFX10/11.
447+ class SIGfx10CacheControl final : public SICacheControl {
447448public:
448- SIGfx10CacheControl (const GCNSubtarget &ST) : SIGfx6CacheControl (ST) {}
449+ SIGfx10CacheControl (const GCNSubtarget &ST) : SICacheControl (ST) {}
449450
450451 bool enableLoadCacheBypass (const MachineBasicBlock::iterator &MI,
451452 SIAtomicScope Scope,
452453 SIAtomicAddrSpace AddrSpace) const override ;
453454
455+ bool enableStoreCacheBypass (const MachineBasicBlock::iterator &MI,
456+ SIAtomicScope Scope,
457+ SIAtomicAddrSpace AddrSpace) const override {
458+ return false ;
459+ }
460+
461+ bool enableRMWCacheBypass (const MachineBasicBlock::iterator &MI,
462+ SIAtomicScope Scope,
463+ SIAtomicAddrSpace AddrSpace) const override {
464+ return false ;
465+ }
466+
454467 bool enableVolatileAndOrNonTemporal (MachineBasicBlock::iterator &MI,
455468 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
456469 bool IsVolatile, bool IsNonTemporal,
@@ -463,23 +476,17 @@ class SIGfx10CacheControl : public SIGfx6CacheControl {
463476
464477 bool insertAcquire (MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
465478 SIAtomicAddrSpace AddrSpace, Position Pos) const override ;
466- };
467-
468- class SIGfx11CacheControl : public SIGfx10CacheControl {
469- public:
470- SIGfx11CacheControl (const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
471479
472- bool enableLoadCacheBypass (const MachineBasicBlock::iterator &MI,
473- SIAtomicScope Scope,
474- SIAtomicAddrSpace AddrSpace) const override ;
475-
476- bool enableVolatileAndOrNonTemporal (MachineBasicBlock::iterator &MI,
477- SIAtomicAddrSpace AddrSpace, SIMemOp Op,
478- bool IsVolatile, bool IsNonTemporal,
479- bool IsLastUse) const override ;
480+ bool insertRelease (MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
481+ SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
482+ Position Pos) const override {
483+ return insertWait (MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
484+ IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
485+ /* AtomicsOnly=*/ false );
486+ }
480487};
481488
482- class SIGfx12CacheControl : public SIGfx11CacheControl {
489+ class SIGfx12CacheControl final : public SICacheControl {
483490protected:
484491 // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
485492 // \returns Returns true if \p MI is modified, false otherwise.
@@ -504,7 +511,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
504511 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const ;
505512
506513public:
507- SIGfx12CacheControl (const GCNSubtarget &ST) : SIGfx11CacheControl (ST) {
514+ SIGfx12CacheControl (const GCNSubtarget &ST) : SICacheControl (ST) {
508515 // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
509516 // the behavior is the same if assuming GFX12.0 in CU mode.
510517 assert (!ST.hasGFX1250Insts () || ST.isCuModeEnabled ());
@@ -915,10 +922,8 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
915922 GCNSubtarget::Generation Generation = ST.getGeneration ();
916923 if (Generation < AMDGPUSubtarget::GFX10)
917924 return std::make_unique<SIGfx6CacheControl>(ST);
918- if (Generation < AMDGPUSubtarget::GFX11)
919- return std::make_unique<SIGfx10CacheControl>(ST);
920925 if (Generation < AMDGPUSubtarget::GFX12)
921- return std::make_unique<SIGfx11CacheControl >(ST);
926+ return std::make_unique<SIGfx10CacheControl >(ST);
922927 return std::make_unique<SIGfx12CacheControl>(ST);
923928}
924929
@@ -1438,8 +1443,7 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
14381443}
14391444
14401445bool SIGfx10CacheControl::enableLoadCacheBypass (
1441- const MachineBasicBlock::iterator &MI,
1442- SIAtomicScope Scope,
1446+ const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
14431447 SIAtomicAddrSpace AddrSpace) const {
14441448 assert (MI->mayLoad () && !MI->mayStore ());
14451449 bool Changed = false ;
@@ -1450,7 +1454,9 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
14501454 case SIAtomicScope::AGENT:
14511455 // Set the L0 and L1 cache policies to MISS_EVICT.
14521456 // Note: there is no L2 cache coherent bypass control at the ISA level.
1453- Changed |= enableCPolBits (MI, CPol::GLC | CPol::DLC);
1457+ // For GFX10, set GLC+DLC, for GFX11, only set GLC.
1458+ Changed |=
1459+ enableCPolBits (MI, CPol::GLC | (AMDGPU::isGFX10 (ST) ? CPol::DLC : 0 ));
14541460 break ;
14551461 case SIAtomicScope::WORKGROUP:
14561462 // In WGP mode the waves of a work-group can be executing on either CU of
@@ -1504,6 +1510,10 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
15041510 Changed |= enableCPolBits (MI, CPol::GLC | CPol::DLC);
15051511 }
15061512
1513+ // GFX11: Set MALL NOALLOC for both load and store instructions.
1514+ if (AMDGPU::isGFX11 (ST))
1515+ Changed |= enableCPolBits (MI, CPol::DLC);
1516+
15071517 // Ensure operation has completed at system scope to cause all volatile
15081518 // operations to be visible outside the program in a global order. Do not
15091519 // request cross address space as only the global address space can be
@@ -1524,6 +1534,10 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
15241534 Changed |= enableCPolBits (MI, CPol::GLC);
15251535 Changed |= enableCPolBits (MI, CPol::SLC);
15261536
1537+ // GFX11: Set MALL NOALLOC for both load and store instructions.
1538+ if (AMDGPU::isGFX11 (ST))
1539+ Changed |= enableCPolBits (MI, CPol::DLC);
1540+
15271541 return Changed;
15281542 }
15291543
@@ -1722,102 +1736,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
17221736 return Changed;
17231737}
17241738
1725- bool SIGfx11CacheControl::enableLoadCacheBypass (
1726- const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1727- SIAtomicAddrSpace AddrSpace) const {
1728- assert (MI->mayLoad () && !MI->mayStore ());
1729- bool Changed = false ;
1730-
1731- if (canAffectGlobalAddrSpace (AddrSpace)) {
1732- switch (Scope) {
1733- case SIAtomicScope::SYSTEM:
1734- case SIAtomicScope::AGENT:
1735- // Set the L0 and L1 cache policies to MISS_EVICT.
1736- // Note: there is no L2 cache coherent bypass control at the ISA level.
1737- Changed |= enableCPolBits (MI, CPol::GLC);
1738- break ;
1739- case SIAtomicScope::WORKGROUP:
1740- // In WGP mode the waves of a work-group can be executing on either CU of
1741- // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1742- // CU mode all waves of a work-group are on the same CU, and so the L0
1743- // does not need to be bypassed.
1744- if (!ST.isCuModeEnabled ())
1745- Changed |= enableCPolBits (MI, CPol::GLC);
1746- break ;
1747- case SIAtomicScope::WAVEFRONT:
1748- case SIAtomicScope::SINGLETHREAD:
1749- // No cache to bypass.
1750- break ;
1751- default :
1752- llvm_unreachable (" Unsupported synchronization scope" );
1753- }
1754- }
1755-
1756- /// The scratch address space does not need the global memory caches
1757- /// to be bypassed as all memory operations by the same thread are
1758- /// sequentially consistent, and no other thread can access scratch
1759- /// memory.
1760-
1761- /// Other address spaces do not have a cache.
1762-
1763- return Changed;
1764- }
1765-
1766- bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal (
1767- MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1768- bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false ) const {
1769-
1770- // Only handle load and store, not atomic read-modify-write instructions. The
1771- // latter use glc to indicate if the atomic returns a result and so must not
1772- // be used for cache control.
1773- assert ((MI->mayLoad () ^ MI->mayStore ()) || SIInstrInfo::isLDSDMA (*MI));
1774-
1775- // Only update load and store, not LLVM IR atomic read-modify-write
1776- // instructions. The latter are always marked as volatile so cannot sensibly
1777- // handle it as do not want to pessimize all atomics. Also they do not support
1778- // the nontemporal attribute.
1779- assert (Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1780-
1781- bool Changed = false ;
1782-
1783- if (IsVolatile) {
1784- // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1785- // and MISS_LRU for store instructions.
1786- // Note: there is no L2 cache coherent bypass control at the ISA level.
1787- if (Op == SIMemOp::LOAD)
1788- Changed |= enableCPolBits (MI, CPol::GLC);
1789-
1790- // Set MALL NOALLOC for load and store instructions.
1791- Changed |= enableCPolBits (MI, CPol::DLC);
1792-
1793- // Ensure operation has completed at system scope to cause all volatile
1794- // operations to be visible outside the program in a global order. Do not
1795- // request cross address space as only the global address space can be
1796- // observable outside the program, so no need to cause a waitcnt for LDS
1797- // address space operations.
1798- Changed |= insertWait (MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false ,
1799- Position::AFTER, AtomicOrdering::Unordered,
1800- /* AtomicsOnly=*/ false );
1801- return Changed;
1802- }
1803-
1804- if (IsNonTemporal) {
1805- // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1806- // and L2 cache policy to STREAM.
1807- // For stores setting both GLC and SLC configures L0 and L1 cache policy
1808- // to MISS_EVICT and the L2 cache policy to STREAM.
1809- if (Op == SIMemOp::STORE)
1810- Changed |= enableCPolBits (MI, CPol::GLC);
1811- Changed |= enableCPolBits (MI, CPol::SLC);
1812-
1813- // Set MALL NOALLOC for load and store instructions.
1814- Changed |= enableCPolBits (MI, CPol::DLC);
1815- return Changed;
1816- }
1817-
1818- return Changed;
1819- }
1820-
18211739bool SIGfx12CacheControl::setTH (const MachineBasicBlock::iterator MI,
18221740 AMDGPU::CPol::CPol Value) const {
18231741 MachineOperand *CPol = TII->getNamedOperand (*MI, OpName::cpol);
0 commit comments