diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 2cddc3365d5d7..c7d515aeb012f 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -537,6 +537,8 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following - Packed work-item Add product IDs names. + - Workgroup + Clusters =========== =============== ============ ===== ================= =============== =============== ====================== @@ -1095,6 +1097,22 @@ is conservatively correct for OpenCL. - ``wavefront`` and executed by a thread in the same wavefront. + ``cluster`` Synchronizes with, and participates in modification + and seq_cst total orderings with, other operations + (except image operations) for all address spaces + (except private, or generic that accesses private) + provided the other operation's sync scope is: + + - ``system``, ``agent`` or ``cluster`` and + executed by a thread on the same cluster. + - ``workgroup`` and executed by a thread in the + same work-group. + - ``wavefront`` and executed by a thread in the + same wavefront. + + On targets that do not support workgroup cluster + launch mode, this behaves like ``agent`` scope instead. + ``workgroup`` Synchronizes with, and participates in modification and seq_cst total orderings with, other operations (except image operations) for all address spaces @@ -1128,6 +1146,9 @@ is conservatively correct for OpenCL. ``agent-one-as`` Same as ``agent`` but only synchronizes with other operations within the same address space. + ``cluster-one-as`` Same as ``cluster`` but only synchronizes with other + operations within the same address space. + ``workgroup-one-as`` Same as ``workgroup`` but only synchronizes with other operations within the same address space. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp index eda479064d7b2..d09b7cffe9f29 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp @@ -23,6 +23,7 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI) AgentSSID = CTX.getOrInsertSyncScopeID("agent"); WorkgroupSSID = CTX.getOrInsertSyncScopeID("workgroup"); WavefrontSSID = CTX.getOrInsertSyncScopeID("wavefront"); + ClusterSSID = CTX.getOrInsertSyncScopeID("cluster"); SystemOneAddressSpaceSSID = CTX.getOrInsertSyncScopeID("one-as"); AgentOneAddressSpaceSSID = @@ -33,4 +34,5 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI) CTX.getOrInsertSyncScopeID("wavefront-one-as"); SingleThreadOneAddressSpaceSSID = CTX.getOrInsertSyncScopeID("singlethread-one-as"); + ClusterOneAddressSpaceSSID = CTX.getOrInsertSyncScopeID("cluster-one-as"); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h index fcb0c8cfb7ca6..bf852bb38376e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h @@ -32,6 +32,8 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF { SyncScope::ID WorkgroupSSID; /// Wavefront synchronization scope ID (cross address space). SyncScope::ID WavefrontSSID; + /// Cluster synchronization scope ID (cross address space). + SyncScope::ID ClusterSSID; /// System synchronization scope ID (single address space). SyncScope::ID SystemOneAddressSpaceSSID; /// Agent synchronization scope ID (single address space). @@ -42,6 +44,8 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF { SyncScope::ID WavefrontOneAddressSpaceSSID; /// Single thread synchronization scope ID (single address space). 
SyncScope::ID SingleThreadOneAddressSpaceSSID; + /// Cluster synchronization scope ID (single address space). + SyncScope::ID ClusterOneAddressSpaceSSID; /// In AMDGPU target synchronization scopes are inclusive, meaning a /// larger synchronization scope is inclusive of a smaller synchronization @@ -60,12 +64,15 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF { else if (SSID == getWorkgroupSSID() || SSID == getWorkgroupOneAddressSpaceSSID()) return 2; + else if (SSID == getClusterSSID() || + SSID == getClusterOneAddressSpaceSSID()) + return 3; else if (SSID == getAgentSSID() || SSID == getAgentOneAddressSpaceSSID()) - return 3; + return 4; else if (SSID == SyncScope::System || SSID == getSystemOneAddressSpaceSSID()) - return 4; + return 5; return std::nullopt; } @@ -73,7 +80,8 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF { /// \returns True if \p SSID is restricted to single address space, false /// otherwise bool isOneAddressSpace(SyncScope::ID SSID) const { - return SSID == getSingleThreadOneAddressSpaceSSID() || + return SSID == getClusterOneAddressSpaceSSID() || + SSID == getSingleThreadOneAddressSpaceSSID() || SSID == getWavefrontOneAddressSpaceSSID() || SSID == getWorkgroupOneAddressSpaceSSID() || SSID == getAgentOneAddressSpaceSSID() || @@ -95,6 +103,8 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF { SyncScope::ID getWavefrontSSID() const { return WavefrontSSID; } + /// \returns Cluster synchronization scope ID (cross address space). + SyncScope::ID getClusterSSID() const { return ClusterSSID; } /// \returns System synchronization scope ID (single address space). 
SyncScope::ID getSystemOneAddressSpaceSSID() const { return SystemOneAddressSpaceSSID; @@ -115,6 +125,10 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF { SyncScope::ID getSingleThreadOneAddressSpaceSSID() const { return SingleThreadOneAddressSpaceSSID; } + /// \returns Cluster synchronization scope ID (single address space). + SyncScope::ID getClusterOneAddressSpaceSSID() const { + return ClusterOneAddressSpaceSSID; + } /// In AMDGPU target synchronization scopes are inclusive, meaning a /// larger synchronization scope is inclusive of a smaller synchronization diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index e172a9c699fb1..cbd6f64976d21 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1833,6 +1833,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return GFX1250Insts && getGeneration() == GFX12; } + /// \returns true if the subtarget supports clusters of workgroups. + bool hasClusters() const { return GFX1250Insts; } + /// \returns true if the subtarget requires a wait for xcnt before atomic /// flat/global stores & rmw. bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 0be6a9d09379f..1637c06936f9b 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -63,6 +63,7 @@ enum class SIAtomicScope { SINGLETHREAD, WAVEFRONT, WORKGROUP, + CLUSTER, // Promoted to AGENT on targets without workgroup clusters. 
AGENT, SYSTEM }; @@ -106,6 +107,7 @@ class SIMemOpInfo final { bool IsCooperative = false; SIMemOpInfo( + const GCNSubtarget &ST, AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent, SIAtomicScope Scope = SIAtomicScope::SYSTEM, SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC, @@ -156,6 +158,11 @@ class SIMemOpInfo final { SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) { this->Scope = std::min(Scope, SIAtomicScope::AGENT); } + + // On targets that have no concept of a workgroup cluster, use + // AGENT scope as a conservatively correct alternative. + if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters()) + this->Scope = SIAtomicScope::AGENT; } public: @@ -225,6 +232,7 @@ class SIMemOpInfo final { class SIMemOpAccess final { private: const AMDGPUMachineModuleInfo *MMI = nullptr; + const GCNSubtarget &ST; /// Reports unsupported message \p Msg for \p MI to LLVM context. void reportUnsupported(const MachineBasicBlock::iterator &MI, @@ -248,7 +256,7 @@ class SIMemOpAccess final { public: /// Construct class to support accessing the machine memory operands /// of instructions in the machine function \p MF. - SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI); + SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST); /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise. 
std::optional @@ -773,6 +781,8 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true); if (SSID == MMI->getAgentSSID()) return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true); + if (SSID == MMI->getClusterSSID()) + return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true); if (SSID == MMI->getWorkgroupSSID()) return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC, true); @@ -788,6 +798,9 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, if (SSID == MMI->getAgentOneAddressSpaceSSID()) return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); + if (SSID == MMI->getClusterOneAddressSpaceSSID()) + return std::tuple(SIAtomicScope::CLUSTER, + SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); if (SSID == MMI->getWorkgroupOneAddressSpaceSSID()) return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false); @@ -815,8 +828,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { return SIAtomicAddrSpace::OTHER; } -SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_) - : MMI(&MMI_) {} +SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_, + const GCNSubtarget &ST) + : MMI(&MMI_), ST(ST) {} std::optional SIMemOpAccess::constructFromMIWithMMO( const MachineBasicBlock::iterator &MI) const { @@ -877,7 +891,7 @@ std::optional SIMemOpAccess::constructFromMIWithMMO( return std::nullopt; } } - return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, + return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile, IsNonTemporal, IsLastUse, IsCooperative); } @@ -891,7 +905,7 @@ SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const { // Be conservative if there are no memory operands. 
if (MI->getNumMemOperands() == 0) - return SIMemOpInfo(); + return SIMemOpInfo(ST); return constructFromMIWithMMO(MI); } @@ -905,7 +919,7 @@ SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const { // Be conservative if there are no memory operands. if (MI->getNumMemOperands() == 0) - return SIMemOpInfo(); + return SIMemOpInfo(ST); return constructFromMIWithMMO(MI); } @@ -946,8 +960,9 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const { if (SynchronizeAS) OrderingAddrSpace = *SynchronizeAS; - return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, - IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic); + return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, + SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering, + AtomicOrdering::NotAtomic); } std::optional SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( @@ -959,7 +974,7 @@ std::optional SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( // Be conservative if there are no memory operands. 
if (MI->getNumMemOperands() == 0) - return SIMemOpInfo(); + return SIMemOpInfo(ST); return constructFromMIWithMMO(MI); } @@ -2377,6 +2392,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: + case SIAtomicScope::CLUSTER: if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) LOADCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) @@ -2413,6 +2429,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: + case SIAtomicScope::CLUSTER: case SIAtomicScope::WORKGROUP: // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is // not needed as LDS operations for all waves are executed in a total @@ -2495,6 +2512,9 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, case SIAtomicScope::AGENT: ScopeImm = AMDGPU::CPol::SCOPE_DEV; break; + case SIAtomicScope::CLUSTER: + ScopeImm = AMDGPU::CPol::SCOPE_SE; + break; case SIAtomicScope::WORKGROUP: // GFX12.0: // In WGP mode the waves of a work-group can be executing on either CU of @@ -2565,6 +2585,7 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, .addImm(AMDGPU::CPol::SCOPE_DEV); } break; + case SIAtomicScope::CLUSTER: case SIAtomicScope::WORKGROUP: // No WB necessary, but we still have to wait. break; @@ -2649,11 +2670,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { const unsigned Scope = CPol->getImm() & CPol::SCOPE; // GFX12.0 only: Extra waits needed before system scope stores. 
- if (!ST.hasGFX1250Insts()) { - if (!Atomic && Scope == CPol::SCOPE_SYS) - return insertWaitsBeforeSystemScopeStore(MI); - return Changed; - } + if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS) + Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator()); return Changed; } @@ -2684,6 +2702,9 @@ bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI, case SIAtomicScope::AGENT: Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV); break; + case SIAtomicScope::CLUSTER: + Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE); + break; case SIAtomicScope::WORKGROUP: // In workgroup mode, SCOPE_SE is needed as waves can executes on // different CUs that access different L0s. @@ -2930,8 +2951,8 @@ SIMemoryLegalizerPass::run(MachineFunction &MF, bool SIMemoryLegalizer::run(MachineFunction &MF) { bool Changed = false; - SIMemOpAccess MOA(MMI.getObjFileInfo()); const GCNSubtarget &ST = MF.getSubtarget(); + SIMemOpAccess MOA(MMI.getObjFileInfo(), ST); CC = SICacheControl::create(ST); for (auto &MBB : MF) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll index 736a8b58466dd..d288bfc6a09db 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -1632,6 +1632,801 @@ entry: ret void } + +define amdgpu_kernel void @cluster_acquire_fence() { +; GFX6-LABEL: cluster_acquire_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: cluster_acquire_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: cluster_acquire_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; 
GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: cluster_acquire_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: cluster_acquire_fence: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: cluster_acquire_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: cluster_acquire_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: cluster_acquire_fence: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: cluster_acquire_fence: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: cluster_acquire_fence: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: cluster_acquire_fence: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: cluster_acquire_fence: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_storecnt 
0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: cluster_acquire_fence: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: cluster_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm +entry: + fence syncscope("cluster") acquire + ret void +} + +define amdgpu_kernel void @cluster_release_fence() { +; GFX6-LABEL: cluster_release_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: cluster_release_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: cluster_release_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: cluster_release_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: cluster_release_fence: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: cluster_release_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: cluster_release_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: cluster_release_fence: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: cluster_release_fence: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: cluster_release_fence: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: cluster_release_fence: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: cluster_release_fence: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: cluster_release_fence: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: cluster_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm +entry: + fence syncscope("cluster") release + ret void +} + +define amdgpu_kernel void @cluster_acq_rel_fence() { +; GFX6-LABEL: cluster_acq_rel_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: cluster_acq_rel_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: cluster_acq_rel_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: cluster_acq_rel_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: cluster_acq_rel_fence: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: cluster_acq_rel_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: cluster_acq_rel_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: cluster_acq_rel_fence: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: cluster_acq_rel_fence: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: cluster_acq_rel_fence: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: cluster_acq_rel_fence: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt 
null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: cluster_acq_rel_fence: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: cluster_acq_rel_fence: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: cluster_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm +entry: + fence syncscope("cluster") acq_rel + ret void +} + +define amdgpu_kernel void @cluster_seq_cst_fence() { +; GFX6-LABEL: cluster_seq_cst_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: cluster_seq_cst_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: cluster_seq_cst_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: cluster_seq_cst_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: cluster_seq_cst_fence: +; SKIP-CACHE-INV: ; %bb.0: ; 
%entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: cluster_seq_cst_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: cluster_seq_cst_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: cluster_seq_cst_fence: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: cluster_seq_cst_fence: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: cluster_seq_cst_fence: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: cluster_seq_cst_fence: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: cluster_seq_cst_fence: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: cluster_seq_cst_fence: +; GFX12-CU: ; %bb.0: ; %entry +; 
GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: cluster_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm +entry: + fence syncscope("cluster") seq_cst + ret void +} + +define amdgpu_kernel void @cluster_one_as_acquire_fence() { +; GFX6-LABEL: cluster_one_as_acquire_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: cluster_one_as_acquire_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: cluster_one_as_acquire_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: cluster_one_as_acquire_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: cluster_one_as_acquire_fence: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: cluster_one_as_acquire_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: cluster_one_as_acquire_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: cluster_one_as_acquire_fence: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: cluster_one_as_acquire_fence: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: cluster_one_as_acquire_fence: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: cluster_one_as_acquire_fence: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: cluster_one_as_acquire_fence: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: cluster_one_as_acquire_fence: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: cluster_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm +entry: + fence syncscope("cluster-one-as") acquire + ret void +} + +define amdgpu_kernel void @cluster_one_as_release_fence() { +; GFX6-LABEL: cluster_one_as_release_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: cluster_one_as_release_fence: +; GFX7: ; %bb.0: ; 
%entry +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: cluster_one_as_release_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: cluster_one_as_release_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: cluster_one_as_release_fence: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: cluster_one_as_release_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: cluster_one_as_release_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: cluster_one_as_release_fence: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: cluster_one_as_release_fence: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: cluster_one_as_release_fence: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: cluster_one_as_release_fence: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: cluster_one_as_release_fence: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: cluster_one_as_release_fence: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: cluster_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm +entry: + fence syncscope("cluster-one-as") release + ret void +} + +define amdgpu_kernel void @cluster_one_as_acq_rel_fence() { +; GFX6-LABEL: cluster_one_as_acq_rel_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: cluster_one_as_acq_rel_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: cluster_one_as_acq_rel_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: cluster_one_as_acq_rel_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: cluster_one_as_acq_rel_fence: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: cluster_one_as_acq_rel_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: cluster_one_as_acq_rel_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: cluster_one_as_acq_rel_fence: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: cluster_one_as_acq_rel_fence: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: cluster_one_as_acq_rel_fence: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: cluster_one_as_acq_rel_fence: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: cluster_one_as_acq_rel_fence: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: cluster_one_as_acq_rel_fence: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: cluster_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm +entry: + fence 
syncscope("cluster-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @cluster_one_as_seq_cst_fence() { +; GFX6-LABEL: cluster_one_as_seq_cst_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: cluster_one_as_seq_cst_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: cluster_one_as_seq_cst_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: cluster_one_as_seq_cst_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: cluster_one_as_seq_cst_fence: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: cluster_one_as_seq_cst_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: cluster_one_as_seq_cst_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: cluster_one_as_seq_cst_fence: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: cluster_one_as_seq_cst_fence: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: cluster_one_as_seq_cst_fence: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: cluster_one_as_seq_cst_fence: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: cluster_one_as_seq_cst_fence: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: cluster_one_as_seq_cst_fence: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: cluster_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm +entry: + fence syncscope("cluster-one-as") seq_cst + ret void +} + define amdgpu_kernel void @agent_acquire_fence() { ; GFX6-LABEL: agent_acquire_fence: ; GFX6: ; %bb.0: ; %entry diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll new file mode 100644 index 0000000000000..9ea9f1125c726 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll @@ -0,0 +1,25726 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; 
RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX942-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s + +define amdgpu_kernel void @flat_cluster_unordered_load( +; GFX7-LABEL: flat_cluster_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: 
v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_unordered_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_unordered_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_unordered_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_unordered_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; 
GFX12-WGP-LABEL: flat_cluster_unordered_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_unordered_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %in, ptr %out) { +entry: + %val = load atomic i32, ptr %in syncscope("cluster") unordered, align 4 + store i32 %val, ptr %out + ret void +} + +define amdgpu_kernel void @flat_cluster_monotonic_load( +; GFX7-LABEL: flat_cluster_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: 
flat_cluster_monotonic_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_monotonic_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_monotonic_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] 
glc +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_monotonic_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_monotonic_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %in, ptr %out) { +entry: + %val = load atomic i32, ptr %in syncscope("cluster") monotonic, align 4 + store i32 %val, ptr %out + ret void +} + +define amdgpu_kernel void @flat_cluster_acquire_load( +; GFX7-LABEL: flat_cluster_acquire_load: +; GFX7: ; %bb.0: ; %entry +; 
GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc 
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_acquire_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: flat_load_b32 
v2, v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_acquire_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_acquire_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_acquire_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 
v1, s1 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %in, ptr %out) { +entry: + %val = load atomic i32, ptr %in syncscope("cluster") acquire, align 4 + store i32 %val, ptr %out + ret void +} + +define amdgpu_kernel void @flat_cluster_seq_cst_load( +; GFX7-LABEL: flat_cluster_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; 
GFX90A-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_seq_cst_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_seq_cst_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 
v1, s3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_seq_cst_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_seq_cst_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %in, ptr %out) { +entry: + %val = load atomic i32, ptr %in syncscope("cluster") seq_cst, align 4 + store i32 %val, ptr %out + ret void +} + +define amdgpu_kernel void @flat_cluster_unordered_store( +; GFX7-LABEL: flat_cluster_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: 
flat_cluster_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 
0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_unordered_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_unordered_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_unordered_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_unordered_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: 
flat_cluster_unordered_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_unordered_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + i32 %in, ptr %out) { +entry: + store atomic i32 %in, ptr %out syncscope("cluster") unordered, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_monotonic_store( +; GFX7-LABEL: flat_cluster_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; 
GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_monotonic_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_monotonic_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_monotonic_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_monotonic_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + i32 %in, ptr %out) { +entry: + store atomic i32 %in, ptr %out syncscope("cluster") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_release_store( +; GFX7-LABEL: flat_cluster_release_store: +; GFX7: ; %bb.0: ; 
%entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; 
%entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_release_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_release_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_release_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_release_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], 
s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_release_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + i32 %in, ptr %out) { +entry: + store atomic i32 %in, ptr %out syncscope("cluster") release, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_seq_cst_store( +; GFX7-LABEL: flat_cluster_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 
v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 
v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm 
+; +; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_seq_cst_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_seq_cst_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_seq_cst_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; 
GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_seq_cst_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + i32 %in, ptr %out) { +entry: + store atomic i32 %in, ptr %out syncscope("cluster") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_monotonic_atomicrmw( +; GFX7-LABEL: flat_cluster_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; 
GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_monotonic_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_monotonic_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_monotonic_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_monotonic_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster") monotonic + ret void +} + +define amdgpu_kernel void @flat_cluster_acquire_atomicrmw( +; 
GFX7-LABEL: flat_cluster_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: 
buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_acquire_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_acquire_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 
v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_acquire_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_acquire_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster") acquire + ret void +} + +define amdgpu_kernel void @flat_cluster_release_atomicrmw( +; 
GFX7-LABEL: flat_cluster_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; 
SKIP-CACHE-INV-LABEL: flat_cluster_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; 
GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_release_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_release_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_release_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: 
flat_cluster_release_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_release_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster") release + ret void +} + +define amdgpu_kernel void @flat_cluster_acq_rel_atomicrmw( +; GFX7-LABEL: flat_cluster_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: 
flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] 
op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_acq_rel_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_acq_rel_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_acq_rel_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_acq_rel_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: 
s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster") acq_rel + ret void +} + +define amdgpu_kernel void @flat_cluster_seq_cst_atomicrmw( +; GFX7-LABEL: flat_cluster_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; 
GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; 
GFX90A-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap 
v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_seq_cst_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_seq_cst_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_seq_cst_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_seq_cst_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; 
GFX1250-NEXT: s_endpgm + ptr %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster") seq_cst + ret void +} + +define amdgpu_kernel void @flat_cluster_acquire_ret_atomicrmw( +; GFX7-LABEL: flat_cluster_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_acquire_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_acquire_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_acquire_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_acquire_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster") acquire + store i32 %val, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_acq_rel_ret_atomicrmw( +; GFX7-LABEL: flat_cluster_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_acq_rel_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_acq_rel_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_acq_rel_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_acq_rel_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; 
%entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster") acq_rel + store i32 %val, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw( +; GFX7-LABEL: flat_cluster_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, 
v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: 
s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_seq_cst_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: 
flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_seq_cst_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_seq_cst_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: 
flat_cluster_seq_cst_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster") seq_cst + store i32 %val, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 
s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: 
def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: 
v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") monotonic monotonic + ret void +} + +define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg( +; GFX7-LABEL: flat_cluster_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: 
v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: 
flat_cluster_acquire_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_acquire_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_acquire_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_acquire_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acquire monotonic + ret void +} + 
+define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg( +; GFX7-LABEL: flat_cluster_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; 
GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_release_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; 
%entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_release_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword 
s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_release_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_release_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_release_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_release_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: 
s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") release monotonic + ret void +} + +define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; 
GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv 
sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, 
s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; 
%entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] 
op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 
+; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 
v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg( +; GFX7-LABEL: flat_cluster_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; 
GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: 
s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; 
GFX90A-NOTTGSPLIT-LABEL: flat_cluster_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_monotonic_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_monotonic_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_monotonic_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_monotonic_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 
+; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") monotonic acquire + ret void +} + +define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg( +; GFX7-LABEL: flat_cluster_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 
s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; 
GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 
flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_acquire_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_acquire_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_acquire_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; 
%entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_acquire_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] 
offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acquire acquire + ret void +} + +define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg( +; GFX7-LABEL: flat_cluster_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; 
GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: 
s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_release_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_acquire_cmpxchg: +; 
GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_release_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_release_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_release_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_release_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_release_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 
scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") release acquire + ret void +} + +define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; 
GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: 
s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: +; 
GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 
scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acq_rel acquire + ret void +} + +define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; 
GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: 
s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: +; 
GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 
scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") seq_cst acquire + ret void +} + +define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg( +; GFX7-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; 
GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: 
s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: +; 
GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; 
GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], 
s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") monotonic seq_cst + ret void +} + +define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg( +; GFX7-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 
0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: 
s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: +; 
GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 
scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acquire seq_cst + ret void +} + +define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg( +; GFX7-LABEL: flat_cluster_release_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_release_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; 
GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_release_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: 
s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_release_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_release_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_release_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_seq_cst_cmpxchg: +; 
GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_release_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_release_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_release_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_release_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_release_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 
scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") release seq_cst + ret void +} + +define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg( +; GFX7-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; 
GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: 
s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: +; 
GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 
scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acq_rel seq_cst + ret void +} + +define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; 
GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: 
s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: +; 
GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 
scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @flat_cluster_monotonic_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], 
s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; 
GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
flat_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: 
s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) 
{ +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], 
s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def 
$sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword 
v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; 
GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_release_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_release_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; 
GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_release_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_release_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_release_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def 
$sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_release_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_release_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: 
flat_cluster_release_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_release_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_release_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_release_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_release_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") release monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg( +; 
GFX7-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; 
GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; 
SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: 
buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; 
GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 
s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: 
buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; 
GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: 
s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], 
s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; 
GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: 
flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 
s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; 
GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: 
flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; 
GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; 
GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 
v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; 
GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: 
flat_cluster_acquire_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: 
s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: 
s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] 
op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], 
v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_release_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_release_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_release_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_release_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, 
v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_release_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: 
flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; 
GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 
killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: 
; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; 
GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 
+; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 
th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; 
GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; 
GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm 
+; +; GFX12-CU-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr 
%out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: 
s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; 
GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; 
GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in 
syncscope("cluster") monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_acquire_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; 
GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: 
def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 
flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; 
GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] 
offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
+; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: 
s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 
0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; 
+; GFX942-NOTTGSPLIT-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 
+; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; 
GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: 
s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 
s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; 
GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: 
s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: 
s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, 
s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_unordered_load( +; GFX7-LABEL: flat_cluster_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_unordered_load: +; 
GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_unordered_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_unordered_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_unordered_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_unordered_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_unordered_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 
s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_unordered_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %in, ptr %out) { +entry: + %val = load atomic i32, ptr %in syncscope("cluster-one-as") unordered, align 4 + store i32 %val, ptr %out + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_monotonic_load( +; GFX7-LABEL: flat_cluster_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_load_dword v2, v[0:1] 
glc +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt 
lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 
v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %in, ptr %out) { +entry: + %val = load atomic i32, ptr %in syncscope("cluster-one-as") monotonic, align 4 + store i32 %val, ptr %out + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_acquire_load( +; GFX7-LABEL: flat_cluster_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 
flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; 
GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: 
s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_acquire_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_acquire_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %in, ptr %out) { +entry: + %val = load atomic i32, ptr %in syncscope("cluster-one-as") acquire, align 4 + store i32 %val, ptr %out + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load( +; GFX7-LABEL: flat_cluster_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: 
s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc +; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: 
s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %in, ptr %out) { +entry: + %val = load atomic i32, ptr %in syncscope("cluster-one-as") seq_cst, align 4 + store i32 %val, ptr %out + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_unordered_store( +; GFX7-LABEL: flat_cluster_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: 
flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: 
s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_unordered_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_unordered_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_unordered_store: +; GFX11-WGP: ; %bb.0: ; %entry 
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_unordered_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_unordered_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_unordered_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + i32 %in, ptr %out) { +entry: + store 
atomic i32 %in, ptr %out syncscope("cluster-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_monotonic_store( +; GFX7-LABEL: flat_cluster_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_store: +; 
SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; 
GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + i32 %in, ptr %out) { +entry: + store atomic i32 %in, ptr %out syncscope("cluster-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_release_store( +; GFX7-LABEL: flat_cluster_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_release_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_release_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_release_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_release_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; 
GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + i32 %in, ptr %out) { +entry: + store atomic i32 %in, ptr %out syncscope("cluster-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_seq_cst_store( +; GFX7-LABEL: flat_cluster_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], 
s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: 
s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; 
GFX1250-NEXT: s_endpgm + i32 %in, ptr %out) { +entry: + store atomic i32 %in, ptr %out syncscope("cluster-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_monotonic_atomicrmw( +; GFX7-LABEL: flat_cluster_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: 
s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_atomicrmw: +; GFX12-CU: ; %bb.0: ; 
%entry +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster-one-as") monotonic + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_acquire_atomicrmw( +; GFX7-LABEL: flat_cluster_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv 
sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_acquire_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_acquire_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: 
flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster-one-as") acquire + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_release_atomicrmw( +; GFX7-LABEL: flat_cluster_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt 
vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_release_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: 
v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_release_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_release_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_release_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_release_atomicrmw: 
+; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster-one-as") release + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; 
GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 
scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: 
v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, 
v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster-one-as") acquire + store i32 %val, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: 
s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: 
buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: 
flat_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster-one-as") acq_rel + store i32 %val, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; 
GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, 
s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc 
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("cluster-one-as") seq_cst + store i32 %val, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 
+; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 
s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
flat_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 
0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: 
s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: 
s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; 
GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword 
s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 
v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; 
GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; 
GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: +; 
GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def 
$sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: +; 
GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") release monotonic + ret void +} + +define amdgpu_kernel void 
@flat_cluster_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, 
s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: 
s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: 
flat_cluster_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], 
s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 
s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, 
s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 
0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic acquire + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc 
+; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; 
GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 
s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv 
scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: 
v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, 
s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; 
GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; 
GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: +; GFX12-WGP: 
; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") release acquire + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: 
flat_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, 
s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: 
flat_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; 
GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: 
s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 
s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
flat_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: 
flat_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: 
s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: 
; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, 
s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 
scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic 
seq_cst + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: 
s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 
s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire seq_cst + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: 
flat_cluster_one_as_release_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], 
s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap 
v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: 
def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") release seq_cst + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 
s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt 
vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: 
s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 
+; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; 
GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 
killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel seq_cst + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: 
def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s4, s8 +; GFX10-WGP-NEXT: s_mov_b32 s5, s9 +; GFX10-WGP-NEXT: s_mov_b32 s9, s10 +; GFX10-WGP-NEXT: s_mov_b32 s8, s11 +; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9 +; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-WGP-NEXT: s_mov_b32 s5, s8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: 
flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s4, s8 +; GFX10-CU-NEXT: s_mov_b32 s5, s9 +; GFX10-CU-NEXT: s_mov_b32 s9, s10 +; GFX10-CU-NEXT: s_mov_b32 s8, s11 +; GFX10-CU-NEXT: s_add_u32 s4, s4, s9 +; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8 +; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX10-CU-NEXT: s_mov_b32 s5, s8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, 
s7 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; 
GFX1250-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: 
v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; 
GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def 
$sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; 
GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; 
+; GFX1250-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; 
GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: 
flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 
4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: 
s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
flat_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; 
GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: 
flat_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: 
s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") release monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 
v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: 
buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 
0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: 
s_wait_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; 
GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt 
null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; 
+; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; 
GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: 
+; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; 
GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: 
flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: 
flat_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 
0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; 
GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; 
GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; 
GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv 
+; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 
0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: 
s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; 
GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, 
s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; 
GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 
s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, 
s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: 
s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 
s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, 
s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; 
GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void 
@flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: 
v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: 
s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in 
syncscope("cluster-one-as") monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: 
s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; 
GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; 
SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: 
s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in 
syncscope("cluster-one-as") acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: 
s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; 
GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; 
SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: 
s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: 
flat_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, 
s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; 
GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: 
s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; 
GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; 
GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: 
s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: 
s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 +; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_mov_b32 s6, s4 +; GFX10-WGP-NEXT: s_mov_b32 s7, s5 +; GFX10-WGP-NEXT: s_mov_b32 s11, s12 +; GFX10-WGP-NEXT: s_mov_b32 s10, s13 +; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11 +; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-WGP-NEXT: s_mov_b32 s7, s10 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 +; 
GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc +; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_mov_b32 s6, s4 +; GFX10-CU-NEXT: s_mov_b32 s7, s5 +; GFX10-CU-NEXT: s_mov_b32 s11, s12 +; GFX10-CU-NEXT: s_mov_b32 s10, s13 +; GFX10-CU-NEXT: s_add_u32 s6, s6, s11 +; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10 +; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX10-CU-NEXT: s_mov_b32 s7, s10 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5 
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 
flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], 
v2 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: 
flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 
+; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr %out, i32 4 + %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll new file mode 100644 index 0000000000000..7c0a2ad5cdb78 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll @@ -0,0 +1,23810 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX942-TGSPLIT %s +; RUN: llc 
-mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s + +define amdgpu_kernel void @global_cluster_unordered_load( +; GFX6-LABEL: global_cluster_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, s9 +; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GFX6-NEXT: s_mov_b32 s12, 0x100f000 +; GFX6-NEXT: s_mov_b32 s13, -1 +; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX6-NEXT: s_mov_b32 s9, s6 +; GFX6-NEXT: s_mov_b32 s10, s13 +; GFX6-NEXT: s_mov_b32 s11, s12 +; GFX6-NEXT: s_mov_b32 s14, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s14 +; GFX6-NEXT: s_mov_b32 s6, s13 +; GFX6-NEXT: s_mov_b32 s7, s12 +; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; 
GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s9, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s10, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: ; kill: def 
$sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_unordered_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_unordered_load: +; GFX942-TGSPLIT: ; 
%bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_unordered_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_unordered_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_unordered_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_unordered_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] 
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %val = load atomic i32, ptr addrspace(1) %in syncscope("cluster") unordered, align 4 + store i32 %val, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @global_cluster_monotonic_load( +; GFX6-LABEL: global_cluster_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, s9 +; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GFX6-NEXT: s_mov_b32 s12, 0x100f000 +; GFX6-NEXT: s_mov_b32 s13, -1 +; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX6-NEXT: s_mov_b32 s9, s6 +; GFX6-NEXT: s_mov_b32 s10, s13 +; GFX6-NEXT: s_mov_b32 s11, s12 +; GFX6-NEXT: s_mov_b32 s14, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s14 +; GFX6-NEXT: s_mov_b32 s6, s13 +; GFX6-NEXT: s_mov_b32 s7, s12 +; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s9, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s2 +; SKIP-CACHE-INV-NEXT: 
s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s10, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] 
sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_monotonic_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_monotonic_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_monotonic_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: 
global_cluster_monotonic_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %val = load atomic i32, ptr addrspace(1) %in syncscope("cluster") monotonic, align 4 + store i32 %val, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @global_cluster_acquire_load( +; GFX6-LABEL: global_cluster_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, s9 +; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GFX6-NEXT: s_mov_b32 s12, 0x100f000 +; GFX6-NEXT: s_mov_b32 s13, -1 +; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX6-NEXT: s_mov_b32 s9, s6 +; GFX6-NEXT: s_mov_b32 s10, s13 +; GFX6-NEXT: s_mov_b32 s11, s12 +; GFX6-NEXT: s_mov_b32 s14, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s14 +; GFX6-NEXT: s_mov_b32 s6, s13 +; GFX6-NEXT: s_mov_b32 s7, s12 +; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 
0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s9, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s10, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword 
v1, v0, s[6:7] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_acquire_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_acquire_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_acquire_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_acquire_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_acquire_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %val = load atomic i32, ptr addrspace(1) %in syncscope("cluster") acquire, align 4 + store i32 %val, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @global_cluster_seq_cst_load( +; GFX6-LABEL: 
global_cluster_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, s9 +; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GFX6-NEXT: s_mov_b32 s12, 0x100f000 +; GFX6-NEXT: s_mov_b32 s13, -1 +; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX6-NEXT: s_mov_b32 s9, s6 +; GFX6-NEXT: s_mov_b32 s10, s13 +; GFX6-NEXT: s_mov_b32 s11, s12 +; GFX6-NEXT: s_mov_b32 s14, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s14 +; GFX6-NEXT: s_mov_b32 s6, s13 +; GFX6-NEXT: s_mov_b32 s7, s12 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s9, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s10, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 +; SKIP-CACHE-INV-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_load: +; 
GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_seq_cst_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_seq_cst_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_seq_cst_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 
0x0 +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_seq_cst_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %val = load atomic i32, ptr addrspace(1) %in syncscope("cluster") seq_cst, align 4 + store i32 %val, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @global_cluster_unordered_store( +; GFX6-LABEL: global_cluster_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: 
s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_unordered_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; 
GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_unordered_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_unordered_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_unordered_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_unordered_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_unordered_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: 
global_cluster_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(1) %out) { +entry: + store atomic i32 %in, ptr addrspace(1) %out syncscope("cluster") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_monotonic_store( +; GFX6-LABEL: global_cluster_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 
0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, 
s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_monotonic_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_monotonic_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: 
global_cluster_monotonic_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_monotonic_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(1) %out) { +entry: + store atomic i32 %in, ptr addrspace(1) %out syncscope("cluster") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_release_store( +; GFX6-LABEL: global_cluster_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed 
$sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_release_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_release_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_release_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_release_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_release_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(1) %out) { +entry: + store atomic i32 %in, ptr addrspace(1) %out syncscope("cluster") release, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_seq_cst_store( +; GFX6-LABEL: global_cluster_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed 
$sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: 
global_store_dword v0, v1, s[0:1] sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_seq_cst_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_seq_cst_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_seq_cst_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_seq_cst_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(1) %out) { +entry: + store atomic i32 %in, ptr addrspace(1) %out syncscope("cluster") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_monotonic_atomicrmw( +; GFX6-LABEL: global_cluster_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: 
global_cluster_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; 
SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_monotonic_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_monotonic_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_monotonic_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_monotonic_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 
%in syncscope("cluster") monotonic + ret void +} + +define amdgpu_kernel void @global_cluster_acquire_atomicrmw( +; GFX6-LABEL: global_cluster_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; 
GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_acquire_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_acquire_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_acquire_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_acquire_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_acquire_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster") acquire + ret 
void +} + +define amdgpu_kernel void @global_cluster_release_atomicrmw( +; GFX6-LABEL: global_cluster_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; 
GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_release_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_release_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_release_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_release_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_release_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster") release + ret void +} + +define amdgpu_kernel void 
@global_cluster_acq_rel_atomicrmw( +; GFX6-LABEL: global_cluster_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; 
GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] 
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: 
global_cluster_acq_rel_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_acq_rel_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_acq_rel_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_acq_rel_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: 
s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster") acq_rel + ret void +} + +define amdgpu_kernel void @global_cluster_seq_cst_atomicrmw( +; GFX6-LABEL: global_cluster_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; 
GFX7-LABEL: global_cluster_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, 
s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry 
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_seq_cst_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_seq_cst_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_seq_cst_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_seq_cst_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster") seq_cst + ret void +} + +define amdgpu_kernel void @global_cluster_acquire_ret_atomicrmw( +; GFX6-LABEL: global_cluster_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_acquire_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
global_cluster_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_acquire_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_acquire_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_acquire_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_acquire_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; 
GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_acquire_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster") acquire + store i32 %val, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw( +; GFX6-LABEL: global_cluster_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: 
s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; 
GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword 
v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_acq_rel_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_acq_rel_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_acq_rel_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_acq_rel_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster") acq_rel + store i32 %val, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw( +; GFX6-LABEL: global_cluster_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: 
s_load_dword s8, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; 
GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword 
s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_seq_cst_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_seq_cst_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_seq_cst_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_seq_cst_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 
%in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster") seq_cst + store i32 %val, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: global_cluster_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: 
; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: 
s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_monotonic_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: 
global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_monotonic_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_monotonic_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_monotonic_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: 
global_cluster_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_cluster_acquire_monotonic_cmpxchg( +; GFX6-LABEL: global_cluster_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: 
s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: 
s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, 
s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_acquire_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_acquire_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_acquire_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; 
GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_acquire_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_acquire_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 
s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acquire monotonic + ret void +} + +define amdgpu_kernel void @global_cluster_release_monotonic_cmpxchg( +; GFX6-LABEL: global_cluster_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def 
$vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_release_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_release_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_release_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_release_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; 
GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_release_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_release_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") release monotonic + ret void +} + +define amdgpu_kernel void @global_cluster_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: 
s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; 
GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_cluster_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: 
s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 
killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: 
buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; 
GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_cluster_monotonic_acquire_cmpxchg( +; GFX6-LABEL: global_cluster_monotonic_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: 
v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; 
GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: 
global_cluster_monotonic_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_monotonic_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_monotonic_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; 
GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_monotonic_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_monotonic_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 
v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") monotonic acquire + ret void +} + +define amdgpu_kernel void @global_cluster_acquire_acquire_cmpxchg( +; GFX6-LABEL: global_cluster_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, 
s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: 
v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_acquire_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_acquire_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_acquire_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; 
GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_acquire_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_acquire_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: 
global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acquire acquire + ret void +} + +define amdgpu_kernel void @global_cluster_release_acquire_cmpxchg( +; GFX6-LABEL: global_cluster_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: 
s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; 
GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_release_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: 
global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_release_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_release_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_release_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_release_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_release_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def 
$vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") release acquire + ret void +} + +define amdgpu_kernel void @global_cluster_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: global_cluster_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: 
def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 
0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def 
$sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 
+; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_acq_rel_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_acq_rel_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_acq_rel_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_acq_rel_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_cluster_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: global_cluster_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 
s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 
+; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_seq_cst_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: 
global_cluster_seq_cst_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_seq_cst_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_seq_cst_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_cluster_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed 
$sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; 
GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") monotonic seq_cst + ret void +} + +define amdgpu_kernel void @global_cluster_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: global_cluster_acquire_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_acquire_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; 
GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_acquire_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_acquire_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; 
GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_acquire_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_acquire_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_acquire_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; 
GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_acquire_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_acquire_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_acquire_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 
0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acquire seq_cst + ret void +} + +define amdgpu_kernel void @global_cluster_release_seq_cst_cmpxchg( +; GFX6-LABEL: global_cluster_release_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; 
GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_release_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; 
GFX10-WGP-LABEL: global_cluster_release_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_release_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_release_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt 
lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_release_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; 
GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_release_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_release_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_release_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; 
GFX12-WGP-LABEL: global_cluster_release_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_release_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") release seq_cst + ret void +} + +define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 
s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt 
null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; 
%entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel seq_cst + ret void +} + +define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; 
GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; 
GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; 
SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: 
buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_cluster_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: 
s_endpgm +; +; GFX7-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; 
GFX10-CU-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; 
SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: 
s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 
killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: 
s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; 
SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: 
global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 
th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_cluster_release_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_release_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_release_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_release_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_release_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; 
SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_release_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_release_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_release_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 
s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_release_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_release_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], 
s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_release_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, 
i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") release monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: 
s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: +; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; 
GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: 
global_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] 
offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed 
$vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: 
s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: 
global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: 
v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: 
s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: 
s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; 
GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: 
s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: 
global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 
s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 
s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: 
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc 
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc 
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: 
s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_cluster_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; 
GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: 
global_cluster_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_release_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_release_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: 
s_endpgm +; +; GFX11-CU-LABEL: global_cluster_release_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_release_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_release_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; 
%entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") release 
acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 
s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed 
$vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; 
GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; 
GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap 
v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: 
global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 
0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def 
$sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; 
GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def 
$sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: 
v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; 
SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], 
s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv 
sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX11-CU: ; 
%bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, 
s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: 
global_cluster_release_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap 
v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; 
GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: 
global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 
0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: +; 
GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, 
s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed 
$exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 
def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 
sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; 
GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; 
GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_unordered_load( +; GFX6-LABEL: global_cluster_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, s9 +; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GFX6-NEXT: s_mov_b32 s12, 0x100f000 +; GFX6-NEXT: s_mov_b32 s13, -1 +; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX6-NEXT: s_mov_b32 s9, s6 +; GFX6-NEXT: s_mov_b32 s10, s13 +; GFX6-NEXT: s_mov_b32 s11, s12 +; GFX6-NEXT: 
s_mov_b32 s14, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s14 +; GFX6-NEXT: s_mov_b32 s6, s13 +; GFX6-NEXT: s_mov_b32 s7, s12 +; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: 
global_cluster_one_as_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s9, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s10, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_unordered_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_unordered_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_unordered_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_unordered_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_unordered_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_unordered_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %val = load atomic i32, ptr addrspace(1) %in syncscope("cluster-one-as") unordered, align 4 + store i32 %val, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_monotonic_load( +; GFX6-LABEL: global_cluster_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 
s6, s9 +; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GFX6-NEXT: s_mov_b32 s12, 0x100f000 +; GFX6-NEXT: s_mov_b32 s13, -1 +; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX6-NEXT: s_mov_b32 s9, s6 +; GFX6-NEXT: s_mov_b32 s10, s13 +; GFX6-NEXT: s_mov_b32 s11, s12 +; GFX6-NEXT: s_mov_b32 s14, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s14 +; GFX6-NEXT: s_mov_b32 s6, s13 +; GFX6-NEXT: s_mov_b32 s7, s12 +; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: 
s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s9, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s10, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 
+; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc +; GFX11-WGP-NEXT: s_waitcnt 
vmcnt(0) +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_monotonic_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_monotonic_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %val = load atomic i32, ptr addrspace(1) %in syncscope("cluster-one-as") monotonic, align 4 + 
store i32 %val, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_acquire_load( +; GFX6-LABEL: global_cluster_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, s9 +; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GFX6-NEXT: s_mov_b32 s12, 0x100f000 +; GFX6-NEXT: s_mov_b32 s13, -1 +; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX6-NEXT: s_mov_b32 s9, s6 +; GFX6-NEXT: s_mov_b32 s10, s13 +; GFX6-NEXT: s_mov_b32 s11, s12 +; GFX6-NEXT: s_mov_b32 s14, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s14 +; GFX6-NEXT: s_mov_b32 s6, s13 +; GFX6-NEXT: s_mov_b32 s7, s12 +; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: 
s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s9, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s10, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, 
s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_acquire_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_acquire_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_acquire_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_acquire_load: +; GFX12-CU: ; %bb.0: ; %entry +; 
GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %val = load atomic i32, ptr addrspace(1) %in syncscope("cluster-one-as") acquire, align 4 + store i32 %val, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_seq_cst_load( +; GFX6-LABEL: global_cluster_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, s9 +; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 +; GFX6-NEXT: s_mov_b32 s12, 0x100f000 +; GFX6-NEXT: s_mov_b32 s13, -1 +; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX6-NEXT: s_mov_b32 s9, s6 +; GFX6-NEXT: s_mov_b32 s10, s13 +; GFX6-NEXT: s_mov_b32 s11, s12 +; GFX6-NEXT: s_mov_b32 s14, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s14 +; GFX6-NEXT: s_mov_b32 s6, s13 +; GFX6-NEXT: s_mov_b32 s7, s12 +; 
GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_load_dword v2, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; 
GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s9, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s10, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
global_cluster_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; 
GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %val = load atomic i32, ptr addrspace(1) %in syncscope("cluster-one-as") seq_cst, align 4 + store i32 %val, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_unordered_store( +; GFX6-LABEL: global_cluster_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 
0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_unordered_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_unordered_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_unordered_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_unordered_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_unordered_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_unordered_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(1) %out) { +entry: + store atomic i32 %in, ptr addrspace(1) %out syncscope("cluster-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_monotonic_store( +; GFX6-LABEL: global_cluster_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; 
GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: 
s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_monotonic_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_monotonic_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(1) %out) { +entry: + store atomic i32 %in, ptr addrspace(1) %out syncscope("cluster-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_release_store( +; GFX6-LABEL: global_cluster_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword 
v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: 
global_cluster_one_as_release_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_release_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_release_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_release_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; 
+; GFX1250-LABEL: global_cluster_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(1) %out) { +entry: + store atomic i32 %in, ptr addrspace(1) %out syncscope("cluster-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_seq_cst_store( +; GFX6-LABEL: global_cluster_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: 
global_cluster_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_store: +; GFX11-WGP: ; %bb.0: ; 
%entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_seq_cst_store: +; 
GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(1) %out) { +entry: + store atomic i32 %in, ptr addrspace(1) %out syncscope("cluster-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_monotonic_atomicrmw( +; GFX6-LABEL: global_cluster_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 
s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: 
; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_monotonic_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: 
global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_monotonic_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster-one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_acquire_atomicrmw( +; GFX6-LABEL: global_cluster_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: 
s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_acquire_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_acquire_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_acquire_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; 
GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_acquire_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster-one-as") acquire + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_release_atomicrmw( +; GFX6-LABEL: global_cluster_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: 
; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: 
s_load_dword s4, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_release_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_release_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_release_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_release_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster-one-as") release + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: global_cluster_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def 
$sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: 
buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_acq_rel_atomicrmw: +; 
GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: global_cluster_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: 
flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; 
GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: 
global_cluster_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: 
global_cluster_one_as_acquire_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: 
global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster-one-as") acquire + store i32 %val, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; 
%entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: 
global_cluster_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; 
GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster-one-as") acq_rel + store i32 %val, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s9, 0x100f000 +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s11 +; GFX6-NEXT: s_mov_b32 
s6, s10 +; GFX6-NEXT: s_mov_b32 s7, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 
v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, 
s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 
+; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("cluster-one-as") seq_cst + store i32 %val, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; 
GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 
s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: 
v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: 
s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def 
$vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; 
GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 
s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; 
GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; 
GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: 
s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def 
$vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: 
def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 
+; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, 
s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; 
SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; 
GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 
+; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; 
GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: 
v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic acquire + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: 
global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: 
global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 
+; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv 
+; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: 
s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 
killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_release_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: 
global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_release_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_release_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_release_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_release_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 
killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release acquire + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; 
kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 
0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def 
$sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: 
s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 
16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: 
; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; 
%entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; 
kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 
+; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, 
s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; 
SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; 
GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 
+; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic seq_cst + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: 
s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 
v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: 
global_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire seq_cst + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def 
$sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 
s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: 
global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release seq_cst + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: 
s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; 
GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel seq_cst + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3 +; GFX7-NEXT: s_mov_b64 s[10:11], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s8 +; GFX7-NEXT: s_mov_b32 s5, s9 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: s_mov_b32 s8, s11 +; GFX7-NEXT: s_add_u32 s4, s4, s9 +; GFX7-NEXT: s_addc_u32 s8, s5, s8 +; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX7-NEXT: s_mov_b32 s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 
killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 
killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; 
GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: 
global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: 
global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, 
s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; 
GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: 
def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: 
global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, 
v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm 
+; +; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 
offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; 
GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: 
s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap 
v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; 
GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def 
$sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc 
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: 
global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, 
v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; 
GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; 
+; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 
s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: 
s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed 
$vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc 
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; 
SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: 
global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire 
acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: 
s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed 
$vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed 
$vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: 
global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 
v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; 
+; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc 
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; 
GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: 
s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], 
off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; 
kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, 
v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 
+; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], 
v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm 
+; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void 
@global_cluster_one_as_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: 
v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 
+; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = 
getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 
s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
global_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: 
v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: 
global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def 
$vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] 
offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: 
+; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: 
global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s12, s5 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 +; GFX6-NEXT: s_mov_b32 s10, 0x100f000 +; GFX6-NEXT: s_mov_b32 s11, -1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 +; GFX6-NEXT: s_mov_b32 s5, s12 +; GFX6-NEXT: s_mov_b32 s6, s11 +; GFX6-NEXT: s_mov_b32 s7, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: ; kill: 
def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3 +; GFX7-NEXT: s_mov_b64 s[12:13], 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, s4 +; GFX7-NEXT: s_mov_b32 s7, s5 +; GFX7-NEXT: s_mov_b32 s11, s12 +; GFX7-NEXT: s_mov_b32 s10, s13 +; GFX7-NEXT: s_add_u32 s6, s6, s11 +; GFX7-NEXT: s_addc_u32 s10, s7, s10 +; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX7-NEXT: s_mov_b32 s7, s10 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-WGP-NEXT: 
s_load_dword s6, s[8:9], 0xc +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1 +; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc1 +; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: 
s_load_dword s2, s[4:5], 0xc +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc1 +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; 
GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: buffer_gl0_inv +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV +; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 +; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + ptr addrspace(1) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll new file mode 100644 index 0000000000000..8926893c68dbc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll @@ -0,0 +1,18467 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: 
llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX942-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s + +define amdgpu_kernel void @local_cluster_unordered_load( +; GFX6-LABEL: local_cluster_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 +; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: 
ds_read_b32 v1, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: 
s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_unordered_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_unordered_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, 
v0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_unordered_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_unordered_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_unordered_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_unordered_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: 
s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %in, ptr addrspace(3) %out) { +entry: + %val = load atomic i32, ptr addrspace(3) %in syncscope("cluster") unordered, align 4 + store i32 %val, ptr addrspace(3) %out + ret void +} + +define amdgpu_kernel void @local_cluster_monotonic_load( +; GFX6-LABEL: local_cluster_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 +; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; 
GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_monotonic_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_monotonic_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; 
+; GFX12-WGP-LABEL: local_cluster_monotonic_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_monotonic_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %in, ptr addrspace(3) %out) { +entry: + %val = load atomic i32, ptr addrspace(3) %in syncscope("cluster") monotonic, align 4 + store i32 %val, ptr addrspace(3) %out + ret void +} + +define amdgpu_kernel void @local_cluster_acquire_load( +; GFX6-LABEL: local_cluster_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 +; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 
v0, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: 
s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_acquire_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_acquire_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_acquire_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_acquire_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_acquire_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: 
local_cluster_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %in, ptr addrspace(3) %out) { +entry: + %val = load atomic i32, ptr addrspace(3) %in syncscope("cluster") acquire, align 4 + store i32 %val, ptr addrspace(3) %out + ret void +} + +define amdgpu_kernel void @local_cluster_seq_cst_load( +; GFX6-LABEL: local_cluster_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 +; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
local_cluster_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_seq_cst_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_seq_cst_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_seq_cst_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_seq_cst_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; 
GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %in, ptr addrspace(3) %out) { +entry: + %val = load atomic i32, ptr addrspace(3) %in syncscope("cluster") seq_cst, align 4 + store i32 %val, ptr addrspace(3) %out + ret void +} + +define amdgpu_kernel void @local_cluster_unordered_store( +; GFX6-LABEL: local_cluster_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_unordered_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_unordered_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 
0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_unordered_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_unordered_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_unordered_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_unordered_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: 
ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(3) %out) { +entry: + store atomic i32 %in, ptr addrspace(3) %out syncscope("cluster") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_monotonic_store( +; GFX6-LABEL: local_cluster_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_monotonic_store: +; GFX11-WGP: ; %bb.0: ; 
%entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_monotonic_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_monotonic_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_monotonic_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(3) %out) { +entry: + store atomic i32 %in, ptr addrspace(3) %out syncscope("cluster") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_release_store( +; GFX6-LABEL: local_cluster_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword 
s5, s[8:9], 0x1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt 
lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_release_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; 
GFX11-WGP-LABEL: local_cluster_release_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_release_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_release_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_release_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; 
GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(3) %out) { +entry: + store atomic i32 %in, ptr addrspace(3) %out syncscope("cluster") release, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_seq_cst_store( +; GFX6-LABEL: local_cluster_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm 
+; +; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_seq_cst_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_seq_cst_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_seq_cst_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_seq_cst_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, 
s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(3) %out) { +entry: + store atomic i32 %in, ptr addrspace(3) %out syncscope("cluster") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_monotonic_atomicrmw( +; GFX6-LABEL: local_cluster_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; 
GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_monotonic_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_monotonic_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_monotonic_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_monotonic_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster") monotonic + ret void +} + +define amdgpu_kernel void @local_cluster_acquire_atomicrmw( +; GFX6-LABEL: local_cluster_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; 
GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_atomicrmw: +; GFX942-NOTTGSPLIT: 
; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_acquire_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_acquire_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_acquire_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_acquire_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; 
GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_acquire_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster") acquire + ret void +} + +define amdgpu_kernel void @local_cluster_release_atomicrmw( +; GFX6-LABEL: local_cluster_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: 
ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
local_cluster_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_release_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_release_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_release_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; 
GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_release_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_release_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster") release + ret void +} + +define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( +; GFX6-LABEL: local_cluster_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: 
s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: 
s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_endpgm 
+; +; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_acq_rel_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_acq_rel_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_acq_rel_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: 
ds_storexchg_rtn_b32 v0, v0, v1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_acq_rel_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster") acq_rel + ret void +} + +define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( +; GFX6-LABEL: local_cluster_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 
v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_seq_cst_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 
0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_seq_cst_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_seq_cst_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_seq_cst_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; 
%entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster") seq_cst + ret void +} + +define amdgpu_kernel void @local_cluster_acquire_ret_atomicrmw( +; GFX6-LABEL: local_cluster_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: 
ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_acquire_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
local_cluster_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_acquire_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_acquire_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_acquire_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_acquire_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_acquire_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, 
s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster") acquire + store i32 %val, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( +; GFX6-LABEL: local_cluster_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 
v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: buffer_inv 
sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_acq_rel_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_acq_rel_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_acq_rel_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, 
v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_acq_rel_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster") acq_rel + store i32 %val, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( +; GFX6-LABEL: local_cluster_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: 
local_cluster_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_seq_cst_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_seq_cst_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: 
ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_seq_cst_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_seq_cst_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in) { +entry: + %val = 
atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster") seq_cst + store i32 %val, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: local_cluster_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; 
GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_monotonic_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_monotonic_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_monotonic_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; 
GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_monotonic_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_cluster_acquire_monotonic_cmpxchg( +; GFX6-LABEL: local_cluster_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; 
GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_acquire_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_acquire_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_acquire_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_acquire_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, 
s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_acquire_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( +; GFX6-LABEL: local_cluster_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: 
v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_release_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 
0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_release_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_release_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_release_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_release_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_release_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") release monotonic + ret void +} + +define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; 
GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 
v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: 
buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 
s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; 
GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_cluster_monotonic_acquire_cmpxchg( +; GFX6-LABEL: local_cluster_monotonic_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: 
v_mov_b32_e32 v2, s4 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_monotonic_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_monotonic_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_monotonic_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, 
s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_monotonic_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") monotonic acquire + ret void +} + +define amdgpu_kernel void @local_cluster_acquire_acquire_cmpxchg( +; GFX6-LABEL: local_cluster_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: 
v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: 
s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_acquire_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_acquire_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_acquire_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_acquire_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 
0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_acquire_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acquire acquire + ret void +} + +define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( +; GFX6-LABEL: local_cluster_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; 
GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_release_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; 
SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: 
s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_release_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_release_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_release_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: 
s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_release_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_release_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") release acquire + ret void +} + +define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: local_cluster_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: 
s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_acq_rel_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_acq_rel_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_acq_rel_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_acq_rel_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; 
%entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: local_cluster_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 
0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_seq_cst_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_seq_cst_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_seq_cst_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_seq_cst_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg 
volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: +; GFX10-CU: ; 
%bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword 
s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") monotonic seq_cst + ret void +} + +define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: local_cluster_acquire_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_acquire_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 
v2, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_acquire_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_acquire_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_acquire_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: 
s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: 
local_cluster_acquire_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_acquire_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_acquire_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_acquire_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: 
s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_acquire_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acquire seq_cst + ret void +} + +define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( +; GFX6-LABEL: local_cluster_release_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: 
s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_release_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_release_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_release_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; 
GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_release_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_release_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_release_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_release_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_release_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_release_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_release_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: 
s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") release seq_cst + ret void +} + +define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; 
GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: 
s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel seq_cst + ret void +} + +define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: 
v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: +; 
SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry 
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, 
s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_cluster_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; 
GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: 
s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: 
local_cluster_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: 
s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
local_cluster_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 
+; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_cluster_release_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: 
v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_release_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_release_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_release_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: 
v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_release_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_release_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, 
s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_release_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_release_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_release_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_release_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_release_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") release monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, 
s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: 
local_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, 
v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword 
s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; 
GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) 
%out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, 
s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: 
s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, 
s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: 
local_cluster_acquire_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: +; 
GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store 
i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_cluster_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 
offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: 
local_cluster_release_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_release_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_release_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_release_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_release_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; 
%entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: 
local_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: ds_store_b32 
v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: 
s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; 
SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 
s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; 
GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: 
s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void 
@local_cluster_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: 
s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: 
s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; 
GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; 
GFX90A-NOTTGSPLIT-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, 
s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: 
s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, 
s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm 
+; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: 
v_mov_b32_e32 v0, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: 
s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; 
GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_unordered_load( +; GFX6-LABEL: local_cluster_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, 
s[8:9], 0x1 +; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 +; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, 
-1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_unordered_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_unordered_load: 
+; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_unordered_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_unordered_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_unordered_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_unordered_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %in, ptr addrspace(3) %out) { +entry: + %val = load atomic i32, ptr addrspace(3) %in syncscope("cluster-one-as") unordered, align 4 + store i32 %val, ptr addrspace(3) %out + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_monotonic_load( +; GFX6-LABEL: local_cluster_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 +; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; 
%entry +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
local_cluster_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: 
local_cluster_one_as_monotonic_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_monotonic_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %in, ptr addrspace(3) %out) { +entry: + %val = load atomic i32, ptr addrspace(3) %in syncscope("cluster-one-as") monotonic, align 4 + store i32 %val, ptr addrspace(3) %out + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_acquire_load( +; GFX6-LABEL: 
local_cluster_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 +; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; 
SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_acquire_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_acquire_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_acquire_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_acquire_load: +; GFX12-CU: ; %bb.0: ; %entry +; 
GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %in, ptr addrspace(3) %out) { +entry: + %val = load atomic i32, ptr addrspace(3) %in syncscope("cluster-one-as") acquire, align 4 + store i32 %val, ptr addrspace(3) %out + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_seq_cst_load( +; GFX6-LABEL: local_cluster_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr4 +; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr5 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 
v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: ds_load_b32 v1, v0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: ds_load_b32 v1, v0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %in, ptr addrspace(3) %out) { +entry: + %val = load atomic i32, ptr addrspace(3) %in syncscope("cluster-one-as") seq_cst, align 4 + store i32 %val, ptr addrspace(3) %out + ret void +} + +define 
amdgpu_kernel void @local_cluster_one_as_unordered_store( +; GFX6-LABEL: local_cluster_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 
+; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_unordered_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_unordered_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_unordered_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_unordered_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_unordered_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_unordered_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(3) %out) { +entry: + store atomic i32 %in, ptr addrspace(3) %out syncscope("cluster-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_monotonic_store( +; GFX6-LABEL: local_cluster_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_load_dword 
s4, s[8:9], 0x0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_monotonic_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: 
s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_monotonic_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(3) %out) { +entry: + store atomic i32 %in, ptr addrspace(3) %out syncscope("cluster-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_release_store( +; GFX6-LABEL: local_cluster_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: 
local_cluster_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_release_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_release_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: 
s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_release_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_release_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(3) %out) { +entry: + store atomic i32 %in, ptr addrspace(3) %out syncscope("cluster-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_seq_cst_store( +; GFX6-LABEL: local_cluster_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, 
s4 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 
s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: 
v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(3) %out) { +entry: + store atomic i32 %in, ptr addrspace(3) %out syncscope("cluster-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_monotonic_atomicrmw( +; GFX6-LABEL: local_cluster_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 
0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: 
local_cluster_one_as_monotonic_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_monotonic_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; 
GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_monotonic_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster-one-as") monotonic + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_acquire_atomicrmw( +; GFX6-LABEL: local_cluster_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_atomicrmw: +; 
GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_acquire_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_acquire_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_acquire_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: 
local_cluster_one_as_acquire_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster-one-as") acquire + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_release_atomicrmw( +; GFX6-LABEL: local_cluster_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 
v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_release_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_release_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_release_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_release_atomicrmw: +; GFX12-CU: ; 
%bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster-one-as") release + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: local_cluster_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; 
GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; 
GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 
0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: local_cluster_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: 
s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; 
GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: 
s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: 
s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster-one-as") acquire + store i32 %val, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 
s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster-one-as") acq_rel + store i32 %val, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, 
s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: 
s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("cluster-one-as") seq_cst + store i32 %val, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; 
GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 
offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, 
s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; 
SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; 
GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, 
v1, v2 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; 
GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: 
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, 
i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: 
local_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: ds_cmpstore_b32 
v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; 
GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: 
s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; 
GFX12-CU-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic acquire + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm 
+; +; GFX11-WGP-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry 
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 
v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: 
s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_release_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_release_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; 
GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_release_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_release_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release acquire + 
ret void +} + +define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; 
GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, 
s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; 
GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 
offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 
offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 
0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: +; 
GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, 
v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 
v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic seq_cst + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 
0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire seq_cst + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; 
GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: 
s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; 
GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: 
s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release seq_cst + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, 
-1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm 
+; +; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry 
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel seq_cst + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr6 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 
v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: 
s_load_dword s5, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; 
GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst seq_cst + 
ret void +} + +define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; 
GFX10-CU-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; 
GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; 
GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; 
GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: +; 
GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; 
SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: 
local_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: 
s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 
0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; 
SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; 
GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 
offset:16 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = 
getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, 
s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; 
GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; 
GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; 
GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, 
s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: 
v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: 
local_cluster_one_as_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; 
GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: 
s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: 
v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; 
SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; 
GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 
0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: 
local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: 
s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define 
amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: 
local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, 
v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: 
s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; 
GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; 
GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 
0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; 
GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: 
; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 
v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: 
local_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release seq_cst + %val0 = extractvalue { 
i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: 
ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} + +define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: ; kill: def $sgpr5 killed $sgpr4 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: 
ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; 
GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v1 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: ds_store_b32 v0, v1 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v1 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: ds_store_b32 v0, v1 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm + ptr addrspace(3) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(3) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(3) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll new file mode 100644 index 0000000000000..2af195461d2eb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll @@ -0,0 +1,23347 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 
-amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX942-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s + +define amdgpu_kernel void @private_cluster_unordered_load( +; GFX6-LABEL: private_cluster_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword 
v0, v0, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, 
s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_unordered_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_unordered_load: +; 
GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_unordered_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_unordered_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_unordered_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_unordered_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 
0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %in, ptr addrspace(5) %out) { +entry: + %val = load atomic i32, ptr addrspace(5) %in syncscope("cluster") unordered, align 4 + store i32 %val, ptr addrspace(5) %out + ret void +} + +define amdgpu_kernel void @private_cluster_monotonic_load( +; GFX6-LABEL: private_cluster_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; 
GFX7-LABEL: private_cluster_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: 
s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, 
s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_monotonic_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_monotonic_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_monotonic_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_monotonic_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 
0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %in, ptr addrspace(5) %out) { +entry: + %val = load atomic i32, ptr addrspace(5) %in syncscope("cluster") monotonic, align 4 + store i32 %val, ptr addrspace(5) %out + ret void +} + +define amdgpu_kernel void @private_cluster_acquire_load( +; GFX6-LABEL: private_cluster_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s5, 
s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: 
private_cluster_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_acquire_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_acquire_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_acquire_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_acquire_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_acquire_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %in, ptr addrspace(5) %out) { +entry: + %val = 
load atomic i32, ptr addrspace(5) %in syncscope("cluster") acquire, align 4 + store i32 %val, ptr addrspace(5) %out + ret void +} + +define amdgpu_kernel void @private_cluster_seq_cst_load( +; GFX6-LABEL: private_cluster_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: 
s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; 
GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_seq_cst_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_seq_cst_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: 
scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_seq_cst_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_seq_cst_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 
killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %in, ptr addrspace(5) %out) { +entry: + %val = load atomic i32, ptr addrspace(5) %in syncscope("cluster") seq_cst, align 4 + store i32 %val, ptr addrspace(5) %out + ret void +} + +define amdgpu_kernel void @private_cluster_unordered_store( +; GFX6-LABEL: private_cluster_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: 
private_cluster_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; 
GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_unordered_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_unordered_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_unordered_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_unordered_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_unordered_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; 
GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_unordered_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v[0:1], v2 +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(5) %out) { +entry: + store atomic i32 %in, ptr addrspace(5) %out syncscope("cluster") unordered, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_monotonic_store( +; GFX6-LABEL: private_cluster_monotonic_store: +; GFX6: ; 
%bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt 
lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: 
private_cluster_monotonic_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_monotonic_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_monotonic_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_monotonic_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_monotonic_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 
v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(5) %out) { +entry: + store atomic i32 %in, ptr addrspace(5) %out syncscope("cluster") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_release_store( +; GFX6-LABEL: private_cluster_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; 
GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_release_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_release_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: 
private_cluster_release_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_release_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_release_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; 
GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(5) %out) { +entry: + store atomic i32 %in, ptr addrspace(5) %out syncscope("cluster") release, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_seq_cst_store( +; GFX6-LABEL: private_cluster_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; 
GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_seq_cst_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_seq_cst_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_seq_cst_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; 
GFX12-CU-LABEL: private_cluster_seq_cst_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(5) %out) { +entry: + store atomic i32 %in, ptr addrspace(5) %out syncscope("cluster") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_monotonic_atomicrmw( +; GFX6-LABEL: 
private_cluster_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: 
s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_monotonic_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_monotonic_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_monotonic_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_monotonic_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: 
v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster") monotonic + ret void +} + +define amdgpu_kernel void @private_cluster_acquire_atomicrmw( +; GFX6-LABEL: private_cluster_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, 
s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_acquire_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_acquire_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_acquire_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_acquire_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_acquire_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: 
v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster") acquire + ret void +} + +define amdgpu_kernel void @private_cluster_release_atomicrmw( +; GFX6-LABEL: private_cluster_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: 
s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_release_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_release_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_release_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_release_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, 
s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_release_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, 
i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster") release + ret void +} + +define amdgpu_kernel void @private_cluster_acq_rel_atomicrmw( +; GFX6-LABEL: private_cluster_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; 
SKIP-CACHE-INV-LABEL: private_cluster_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; 
GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_acq_rel_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_acq_rel_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_acq_rel_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_acq_rel_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: 
private_cluster_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster") acq_rel + ret void +} + +define amdgpu_kernel void @private_cluster_seq_cst_atomicrmw( +; GFX6-LABEL: private_cluster_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_seq_cst_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, 
s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_seq_cst_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_seq_cst_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_seq_cst_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_seq_cst_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, 
v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster") seq_cst + ret void +} + +define amdgpu_kernel void @private_cluster_acquire_ret_atomicrmw( +; GFX6-LABEL: private_cluster_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(1) +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: 
s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: 
private_cluster_acquire_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; 
GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_acquire_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_acquire_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_acquire_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_acquire_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_acquire_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; 
GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster") acquire + store i32 %val, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_acq_rel_ret_atomicrmw( +; GFX6-LABEL: private_cluster_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 
v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(1) +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NOTTGSPLIT-NEXT: 
buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm 
+; +; GFX11-WGP-LABEL: private_cluster_acq_rel_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_acq_rel_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_acq_rel_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_acq_rel_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 
s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster") acq_rel + store i32 %val, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_seq_cst_ret_atomicrmw( +; GFX6-LABEL: private_cluster_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(1) +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 
0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_seq_cst_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_seq_cst_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_seq_cst_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_seq_cst_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 
v1, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster") seq_cst + store i32 %val, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void 
@private_cluster_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: private_cluster_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: 
buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, 
s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_monotonic_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_monotonic_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_monotonic_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_monotonic_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; 
GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") monotonic monotonic + ret void +} + +define amdgpu_kernel void 
@private_cluster_acquire_monotonic_cmpxchg( +; GFX6-LABEL: private_cluster_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: 
buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 
offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_acquire_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_acquire_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, 
s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_acquire_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_acquire_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_acquire_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, 
s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acquire monotonic + ret 
void +} + +define amdgpu_kernel void @private_cluster_release_monotonic_cmpxchg( +; GFX6-LABEL: private_cluster_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 
s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_release_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: 
buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_release_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_release_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_release_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_release_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_release_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; 
GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in 
syncscope("cluster") release monotonic + ret void +} + +define amdgpu_kernel void @private_cluster_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; 
SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: 
s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; 
GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: 
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @private_cluster_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; 
GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; 
SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; 
GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: 
global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @private_cluster_monotonic_acquire_cmpxchg( +; GFX6-LABEL: private_cluster_monotonic_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 
s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt 
lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: 
private_cluster_monotonic_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_monotonic_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_monotonic_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_monotonic_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 
s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; 
GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") monotonic acquire + ret void +} + +define amdgpu_kernel void @private_cluster_acquire_acquire_cmpxchg( +; GFX6-LABEL: private_cluster_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_acquire_acquire_cmpxchg: +; 
GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: 
s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, 
v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_acquire_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; 
GFX11-WGP-LABEL: private_cluster_acquire_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_acquire_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_acquire_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_acquire_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: 
s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; 
GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acquire acquire + ret void +} + +define amdgpu_kernel void @private_cluster_release_acquire_cmpxchg( +; GFX6-LABEL: private_cluster_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_release_acquire_cmpxchg: +; GFX10-WGP: 
; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_release_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen 
offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_release_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: 
private_cluster_release_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_release_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_release_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_release_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, 
s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") release acquire + ret void +} + +define amdgpu_kernel void @private_cluster_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: private_cluster_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; 
GFX10-WGP-LABEL: private_cluster_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; 
SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; 
GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_acq_rel_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_acq_rel_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_acq_rel_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_acq_rel_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: 
s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel acquire + ret void +} + +define amdgpu_kernel void @private_cluster_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: private_cluster_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: 
v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; 
SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; 
GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_seq_cst_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_seq_cst_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_seq_cst_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: 
private_cluster_seq_cst_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst acquire + ret void +} + +define amdgpu_kernel void @private_cluster_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; 
GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; 
SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; 
GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, 
v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") monotonic seq_cst + ret void +} + +define amdgpu_kernel void @private_cluster_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: private_cluster_acquire_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_acquire_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword 
v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_acquire_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_acquire_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_acquire_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; 
SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acquire_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_acquire_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_acquire_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; 
GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_acquire_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_acquire_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_acquire_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, 
s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_acquire_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 
killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acquire seq_cst + ret void +} + +define amdgpu_kernel void @private_cluster_release_seq_cst_cmpxchg( +; GFX6-LABEL: private_cluster_release_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_release_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: 
s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_release_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_release_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_release_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; 
SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_release_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX90A-TGSPLIT-LABEL: private_cluster_release_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_release_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 
s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_release_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_release_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_release_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, 
off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_release_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: 
s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") release seq_cst + ret void +} + +define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword 
s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: 
private_cluster_acq_rel_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, 
s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; 
GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, 
s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; 
kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel seq_cst + ret void +} + +define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, 
s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, 
s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 
0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: 
s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; 
GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @private_cluster_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: 
buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, 
s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; 
GFX90A-NOTTGSPLIT-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword 
v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: 
private_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: 
private_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: 
v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: 
v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; 
GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: 
scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b32 off, v0, 
s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: private_cluster_release_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_release_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; 
GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_release_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_release_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: 
private_cluster_release_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_release_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 
s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_release_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: 
v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_release_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_release_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_release_monotonic_ret_cmpxchg: +; 
GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_release_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_release_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; 
GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") release monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) 
%out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg: +; 
GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; 
GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: 
scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: 
s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: 
s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, 
s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: 
s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: 
s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: 
s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo 
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, 
s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, 
v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; 
SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen 
offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; 
GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: 
scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: 
v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: private_cluster_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt 
expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 
s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; 
SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; 
GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_acquire_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; 
GFX11-WGP-LABEL: private_cluster_acquire_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_acquire_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_acquire_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: 
private_cluster_acquire_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: 
v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_release_acquire_ret_cmpxchg( +; GFX6-LABEL: private_cluster_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; 
GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt 
vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, 
s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_release_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_release_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: 
scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_release_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_release_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_release_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; 
GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; 
GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 
offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 
s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; 
GFX11-CU-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; 
%bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in 
syncscope("cluster") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, 
s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; 
GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: 
s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: 
v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: 
s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; 
GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: 
s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; 
GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; 
GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], 
src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; 
GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: 
buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; 
GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; 
GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: 
s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: private_cluster_release_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] 
+; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_release_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_release_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; 
GFX10-CU-LABEL: private_cluster_release_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_release_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 
offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_release_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_release_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; 
GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_release_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_release_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: 
scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_release_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_release_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_release_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 
offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_release_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed 
$exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; 
%bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: 
s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: 
s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; 
GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; 
GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 
v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; 
GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 
%old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_unordered_load( +; GFX6-LABEL: private_cluster_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: 
private_cluster_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_unordered_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_unordered_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_unordered_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; 
GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_unordered_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_unordered_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_unordered_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; 
GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %in, ptr addrspace(5) %out) { +entry: + %val = load atomic i32, ptr addrspace(5) %in syncscope("cluster-one-as") unordered, align 4 + store i32 %val, ptr addrspace(5) %out + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_monotonic_load( +; GFX6-LABEL: private_cluster_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, 
s1, 0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; 
GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_monotonic_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_monotonic_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %in, ptr addrspace(5) %out) { +entry: + %val = load atomic i32, ptr addrspace(5) %in syncscope("cluster-one-as") monotonic, align 4 + store i32 %val, ptr addrspace(5) %out + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_acquire_load( +; GFX6-LABEL: private_cluster_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: 
s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_acquire_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_acquire_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_acquire_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_acquire_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: 
scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %in, ptr addrspace(5) %out) { +entry: + %val = load atomic i32, ptr addrspace(5) %in syncscope("cluster-one-as") acquire, align 4 + store i32 %val, ptr addrspace(5) %out + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_seq_cst_load( +; GFX6-LABEL: private_cluster_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, 
s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: 
private_cluster_one_as_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_load: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_load: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_load: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_load: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_load: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_load: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: 
s_wait_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %in, ptr addrspace(5) %out) { +entry: + %val = load atomic i32, ptr addrspace(5) %in syncscope("cluster-one-as") seq_cst, align 4 + store i32 %val, ptr addrspace(5) %out + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_unordered_store( +; GFX6-LABEL: private_cluster_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_unordered_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_unordered_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_unordered_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_unordered_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_unordered_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_unordered_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; 
GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v[0:1], v2 +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(5) %out) { +entry: + store atomic i32 %in, ptr addrspace(5) %out syncscope("cluster-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_monotonic_store( +; GFX6-LABEL: private_cluster_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; 
GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, 
s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; 
GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_monotonic_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_monotonic_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(5) %out) { +entry: + store atomic i32 %in, ptr addrspace(5) %out syncscope("cluster-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_release_store( +; GFX6-LABEL: private_cluster_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: 
s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, 
s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_release_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_release_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_release_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_release_store: 
+; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_release_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_release_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: 
v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(5) %out) { +entry: + store atomic i32 %in, ptr addrspace(5) %out syncscope("cluster-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_seq_cst_store( +; GFX6-LABEL: private_cluster_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_seq_cst_store: +; GFX10-CU: ; 
%bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_store: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_store: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_store: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_store: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_store: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: 
scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_store: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + i32 %in, ptr addrspace(5) %out) { +entry: + store atomic i32 %in, ptr addrspace(5) %out syncscope("cluster-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void 
@private_cluster_one_as_monotonic_atomicrmw( +; GFX6-LABEL: private_cluster_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: 
s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_monotonic_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_monotonic_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry 
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster-one-as") monotonic + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_acquire_atomicrmw( +; GFX6-LABEL: private_cluster_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_acquire_atomicrmw: +; GFX7: 
; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword 
v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; 
GFX11-WGP-LABEL: private_cluster_one_as_acquire_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_acquire_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_acquire_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_acquire_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], 
s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster-one-as") acquire + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_release_atomicrmw( +; GFX6-LABEL: private_cluster_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; 
GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_release_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_release_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_release_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_release_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, 
s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_release_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_release_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: 
def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster-one-as") release + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: private_cluster_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; 
GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; 
GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in) { +entry: + %val = atomicrmw volatile 
xchg ptr addrspace(5) %out, i32 %in syncscope("cluster-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: private_cluster_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: 
private_cluster_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; 
GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; 
+; GFX1250-LABEL: private_cluster_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1 +; GFX1250-NEXT: s_mov_b32 s1, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s1, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, s3 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s1, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; 
GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(1) +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; 
GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: 
buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: 
s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; 
GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster-one-as") acquire + store i32 %val, ptr 
addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(1) +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 
0 offen +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: +; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: 
scratch_store_dword off, v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster-one-as") acq_rel + store i32 %val, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(1) +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; 
GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 
offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1) +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 
off, v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s3, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s0, s3 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("cluster-one-as") seq_cst + store i32 %val, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, 
s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen 
offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; 
%entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX12-WGP: 
; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; 
GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: 
s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: 
s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 
v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; 
GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 
offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen 
offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: 
v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: 
v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: 
v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 
s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: 
s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: 
v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; 
GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: 
s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; 
GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: 
s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; 
SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; 
GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg: +; GFX1250: 
; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic acquire + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: 
private_cluster_one_as_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: 
v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, 
s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 
off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire 
acquire + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_release_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: 
v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 
v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_release_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_release_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_release_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: 
s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_release_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_release_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_release_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 
0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { 
+entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release acquire + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: 
s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: 
v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX11-WGP: ; 
%bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 
0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], 
v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: 
private_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; 
SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, 
s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX12-CU: 
; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: 
s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: 
s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: 
v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec 
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic seq_cst + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; 
GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; 
SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; 
GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: 
s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 
killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire seq_cst + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, 
s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: 
buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, 
v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; 
%bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: +; GFX12-WGP: ; 
%bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: 
s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release seq_cst + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; 
GFX7-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; 
GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, 
s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; 
GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], 
src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel seq_cst + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX6-NEXT: s_mov_b32 s7, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s4, s4, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; 
GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1 +; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 +; GFX7-NEXT: s_mov_b32 s7, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; 
GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, 
s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s3, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s3, 16 +; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s0, s0, s3 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s3, s2, s3 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0 +; 
GFX1250-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[4:5], 0 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s3, s6 +; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_mov_b32_e32 v4, s0 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; 
GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; 
GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; 
SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 
16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: 
s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: 
s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; 
GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: 
buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 
s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 
v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, 
v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed 
$exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: 
buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: 
s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: 
s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; 
GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: 
s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; 
GFX12-CU-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 
v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; 
GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: 
s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: 
private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; 
GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: 
s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; 
GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, 
v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, 
s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; 
GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: 
s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN 
scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: 
v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, 
v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: 
buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; 
GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: 
v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: 
scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) 
{ +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; 
GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg: +; 
SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: 
v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], 
v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX11-CU: ; 
%bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release acquire + %val0 = 
extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; 
GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: 
scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 
s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_ret_cmpxchg( +; 
GFX6-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: 
s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; 
SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
private_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: 
private_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt 
vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: 
v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: 
s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: 
s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: 
s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: 
s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: 
s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; 
GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; 
GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; 
GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, 
s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, 
s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; 
GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; 
%entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: 
buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: 
v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: 
v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; 
GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 
v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; 
GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; 
SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, 
v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; 
GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; 
GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; GFX12-CU-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; 
GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +} + +define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_add_u32 s0, s0, s15 +; GFX6-NEXT: s_addc_u32 s1, s1, 0 +; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX6-NEXT: s_mov_b32 s5, 16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_add_i32 s5, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: 
v_mov_b32_e32 v1, s4 +; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_u32 s0, s0, s17 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2 +; GFX7-NEXT: s_mov_b32 s5, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_i32 s5, s4, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s0, 
s0, s17 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6 +; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5] +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen 
+; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-NOTTGSPLIT-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NOTTGSPLIT-NEXT: s_nop 0 +; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1 +; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-TGSPLIT-NEXT: s_nop 0 +; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1 +; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0 +; GFX942-TGSPLIT-NEXT: 
s_endpgm +; +; GFX11-WGP-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 16 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1 +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: s_endpgm +; +; GFX11-CU-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 16 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_add_i32 s1, s0, s1 +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3 +; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3 +; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: s_endpgm +; +; GFX12-WGP-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-WGP-NEXT: s_wait_alu 0xf1ff +; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-WGP-NEXT: s_endpgm +; +; 
GFX12-CU-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_kmcnt 0x0 +; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2 +; GFX12-CU-NEXT: s_wait_alu 0xf1ff +; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2 +; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16 +; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_mov_b32 s3, 16 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_add_co_i32 s4, s0, s3 +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3 +; GFX1250-NEXT: s_mov_b32 s3, 20 +; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo +; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7] +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b64 s[6:7], 0 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s5, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s4, s5 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: s_mov_b32 s3, s6 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4 +; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s2 
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 +; GFX1250-NEXT: s_endpgm + ptr addrspace(5) %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, ptr addrspace(5) %out, i32 4 + %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, ptr addrspace(5) %out, align 4 + ret void +}